In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import scale

In [2]:
data = pd.read_csv('winequality_red.csv')

In [3]:
data.std()  # overall data std is reduced to one by standardizing it

fixed acidity            1.741096
volatile acidity         0.179060
citric acid              0.194801
residual sugar           1.409928
chlorides                0.047065
free sulfur dioxide     10.460157
total sulfur dioxide    32.895324
density                  0.001887
pH                       0.154386
sulphates                0.169507
alcohol                  1.065668
quality                  0.807569
dtype: float64

In [4]:
scale(data)

array([[-0.52835961,  0.96187667, -1.39147228, ..., -0.57920652,
        -0.96024611, -0.78782264],
       [-0.29854743,  1.96744245, -1.39147228, ...,  0.1289504 ,
        -0.58477711, -0.78782264],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.04808883,
        -0.58477711, -0.78782264],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.54204194,
         0.54162988,  0.45084835],
       [-1.39015528,  0.65462046, -0.77526673, ...,  0.30598963,
        -0.20930812, -0.78782264],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.01092425,
         0.54162988,  0.45084835]])

In [5]:
#data.describe()  # overall statistical measures aren't computed after standardization

In [6]:
# Range Scaling
x = (data - data.min()) / (data.max() - data.min())
x  # all values are converted to 0-1 range

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.247788,0.397260,0.00,0.068493,0.106845,0.140845,0.098940,0.567548,0.606299,0.137725,0.153846,0.4
1,0.283186,0.520548,0.00,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385,0.4
2,0.283186,0.438356,0.04,0.095890,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385,0.4
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385,0.6
4,0.247788,0.397260,0.00,0.068493,0.106845,0.140845,0.098940,0.567548,0.606299,0.137725,0.153846,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.141593,0.328767,0.08,0.075342,0.130217,0.436620,0.134276,0.354626,0.559055,0.149701,0.323077,0.4
1595,0.115044,0.294521,0.10,0.089041,0.083472,0.535211,0.159011,0.370778,0.614173,0.257485,0.430769,0.6
1596,0.150442,0.267123,0.13,0.095890,0.106845,0.394366,0.120141,0.416300,0.535433,0.251497,0.400000,0.6
1597,0.115044,0.359589,0.12,0.075342,0.105175,0.436620,0.134276,0.396476,0.653543,0.227545,0.276923,0.4


In [7]:
# Range Compression
from sklearn.preprocessing import MinMaxScaler
default_scale = MinMaxScaler()
transformed = default_scale.fit_transform(data)
print(transformed)

[[0.24778761 0.39726027 0.         ... 0.13772455 0.15384615 0.4       ]
 [0.28318584 0.52054795 0.         ... 0.20958084 0.21538462 0.4       ]
 [0.28318584 0.43835616 0.04       ... 0.19161677 0.21538462 0.4       ]
 ...
 [0.15044248 0.26712329 0.13       ... 0.25149701 0.4        0.6       ]
 [0.11504425 0.35958904 0.12       ... 0.22754491 0.27692308 0.4       ]
 [0.12389381 0.13013699 0.47       ... 0.19760479 0.4        0.6       ]]


In [8]:
new_data = data.copy()

# fit_transform on this new_data
pred = default_scale.fit_transform(new_data)

# fit and transform separate function calls
default_scale.fit(data)
pred = default_scale.transform(new_data)

print(pred)

[[0.24778761 0.39726027 0.         ... 0.13772455 0.15384615 0.4       ]
 [0.28318584 0.52054795 0.         ... 0.20958084 0.21538462 0.4       ]
 [0.28318584 0.43835616 0.04       ... 0.19161677 0.21538462 0.4       ]
 ...
 [0.15044248 0.26712329 0.13       ... 0.25149701 0.4        0.6       ]
 [0.11504425 0.35958904 0.12       ... 0.22754491 0.27692308 0.4       ]
 [0.12389381 0.13013699 0.47       ... 0.19760479 0.4        0.6       ]]


In [9]:
# Escaping the Outliers

# To perform robust scaling with scikit learn to deal with outliers
from sklearn.preprocessing import RobustScaler
scaled_data = RobustScaler().fit_transform(data)
scaled_data

array([[-0.23809524,  0.72      , -0.78787879, ..., -0.33333333,
        -0.5       , -1.        ],
       [-0.04761905,  1.44      , -0.78787879, ...,  0.33333333,
        -0.25      , -1.        ],
       [-0.04761905,  0.96      , -0.66666667, ...,  0.16666667,
        -0.25      , -1.        ],
       ...,
       [-0.76190476, -0.04      , -0.39393939, ...,  0.72222222,
         0.5       ,  0.        ],
       [-0.95238095,  0.5       , -0.42424242, ...,  0.5       ,
         0.        , -1.        ],
       [-0.9047619 , -0.84      ,  0.63636364, ...,  0.22222222,
         0.5       ,  0.        ]])

In [10]:
# L2 Normalization -- Normalizer -- avoid all this lengthy computation
# for x in row : (x1 / ((x1^2+...+xn^2)^0.5)) + ... + (xn / ((x1^2+...+xn^2)^0.5))

from sklearn.preprocessing import Normalizer 
Normalizer().fit_transform(data)

array([[0.19347777, 0.01830195, 0.        , ..., 0.01464156, 0.24576906,
        0.13072822],
       [0.10698874, 0.01207052, 0.        , ..., 0.00932722, 0.13442175,
        0.06858252],
       [0.13494887, 0.01314886, 0.00069205, ..., 0.01124574, 0.16955114,
        0.08650569],
       ...,
       [0.1222319 , 0.00989496, 0.00252225, ..., 0.01455142, 0.21342078,
        0.11641133],
       [0.10524769, 0.01150589, 0.00214063, ..., 0.0126654 , 0.18195363,
        0.08919296],
       [0.12491328, 0.00645385, 0.00978487, ..., 0.01374046, 0.22900768,
        0.12491328]])

In [11]:
# Data Imputation/Replacement - tackling missing data - using SimpleImputer
dummy = pd.read_csv('IMDB-Movie-Data.csv')  # this dataset contains some null values 
from sklearn.impute import SimpleImputer
# 
dummy = dummy.drop('Genre', axis = 1)
dummy = dummy.drop('Title', axis = 1)
dummy = dummy.drop('Description', axis = 1)
dummy = dummy.drop('Director', axis = 1)
dummy = dummy.drop('Actors', axis = 1)
dummy = SimpleImputer(strategy = 'mean', fill_value = 1).fit_transform(dummy)  # strategy = mean/median/most_frequent/constant_value
dummy

array([[1.00000000e+00, 2.01400000e+03, 1.21000000e+02, ...,
        7.57074000e+05, 3.33130000e+02, 7.60000000e+01],
       [2.00000000e+00, 2.01200000e+03, 1.24000000e+02, ...,
        4.85820000e+05, 1.26460000e+02, 6.50000000e+01],
       [3.00000000e+00, 2.01600000e+03, 1.17000000e+02, ...,
        1.57606000e+05, 1.38120000e+02, 6.20000000e+01],
       ...,
       [9.98000000e+02, 2.00800000e+03, 9.80000000e+01, ...,
        7.06990000e+04, 5.80100000e+01, 5.00000000e+01],
       [9.99000000e+02, 2.01400000e+03, 9.30000000e+01, ...,
        4.88100000e+03, 8.29563761e+01, 2.20000000e+01],
       [1.00000000e+03, 2.01600000e+03, 8.70000000e+01, ...,
        1.24350000e+04, 1.96400000e+01, 1.10000000e+01]])

In [12]:
# PCA
from sklearn.decomposition import PCA
PCA(n_components=4).fit_transform(data)

array([[-13.22202658,  -2.03192212,  -1.18123474,  -0.47564207],
       [ 22.04025471,   4.40179054,  -0.35499069,  -0.2602393 ],
       [  7.16536169,  -2.50832073,  -0.62463767,  -0.27530638],
       ...,
       [ -3.43293096,  14.27427694,  -1.73227854,   0.21146278],
       [  1.13557385,  16.30769238,  -2.18955318,  -0.294478  ],
       [ -3.87592057,   3.13011173,  -1.84248483,   1.73878746]])

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = data.drop('quality', axis = 1)
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [14]:
from sklearn.datasets import load_breast_cancer
bc = load_breast_cancer()
bc.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

# Data Modeling with Scikit Learn

In [15]:
# Linear Regression  -- already done above 
data
X_train, y_train, X_test, y_test 

(      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
 925             8.6             0.220         0.36             1.9      0.064   
 363            12.5             0.460         0.63             2.0      0.071   
 906             7.2             0.540         0.27             2.6      0.084   
 426             6.4             0.670         0.08             2.1      0.045   
 1251            7.5             0.580         0.14             2.2      0.077   
 ...             ...               ...          ...             ...        ...   
 1130            9.1             0.600         0.00             1.9      0.058   
 1294            8.2             0.635         0.10             2.1      0.073   
 860             7.2             0.620         0.06             2.7      0.077   
 1459            7.9             0.200         0.35             1.7      0.054   
 1126            5.8             0.290         0.26             1.7      0.063   
 
       free su

In [16]:
# Ridge Regression -> uses L2 norm for regularization
# For regularization -> alpha*(L2 norm of weights) + rmse val
from sklearn import linear_model
reg = linear_model.Ridge(alpha = 0.9)
reg.fit(X_train, y_train)

reg.coef_
reg.intercept_
r2 = reg.score(X_test, y_test)
r2

0.34759998500181366

In [17]:
# Using Cross-validation to choose alpha from a list of values
from sklearn import linear_model
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
reg = linear_model.RidgeCV(alphas = alphas)
reg.fit(X_train, y_train)

reg.score(X_test, y_test)

0.3497341896138322

In [18]:
# Lasso Regression - L1 norm of weights - uses fewer parameter values
# Sparse Regularization ->  α∣∣w∣∣1​ + i=1∑n​ (x i​ ⋅w−y i​ ) 2  

from sklearn import linear_model
reg = linear_model.Lasso(alpha = 0.1)
reg.fit(X_train, y_train)

reg.coef_
reg.intercept_
reg.score(X_test, y_test)

0.20887882853160988

In [19]:
# also supports cross validation like in ridge regression using LassoCV
from sklearn import linear_model
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
reg = linear_model.LassoCV(alphas = alpha)
reg.fit(X_train, y_train)

reg.coef_
reg.intercept_
reg.score(X_test, y_test)

0.20887882853160988

In [25]:
# Bayesian Regression -> alpha, lambda-precision factor, 
from sklearn import linear_model
# alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
reg = linear_model.BayesianRidge()  # don't take alpha/lambda parameter by default
reg.fit(X_train, y_train)

reg.coef_
reg.intercept_
reg.score(X_test, y_test)

0.3461509706055187

In [68]:
# Logistic Regression -> binary classification
reg = linear_model.LogisticRegression(max_iter = 10, multi_class = 'multinomial', solver = 'lbfgs', penalty = 'l2')
reg.fit(X_train, y_train)
reg_pred = reg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
# Decision Trees -> CLassification + Regression
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier(max_depth = 8) # max_depth  -> no. of levels/layers
reg_tree = tree.DecisionTreeRegressor(max_depth = 5)
clf_tree.fit(X_train, y_train)
reg_tree.fit(X_train, y_train)
clf_tree.predict(X_test)
reg_tree.predict(X_test)

array([5.48888889, 5.09876543, 5.48888889, 5.35714286, 5.98019802,
       5.09876543, 5.29411765, 5.35714286, 6.27777778, 5.98019802,
       6.92307692, 5.48888889, 5.48888889, 5.48888889, 5.675     ,
       6.24137931, 5.35714286, 5.48888889, 6.94545455, 5.48888889,
       5.09090909, 5.48888889, 5.64864865, 6.05882353, 5.48888889,
       5.48888889, 6.05882353, 5.35714286, 4.94285714, 6.27777778,
       5.35714286, 5.35714286, 5.675     , 5.35714286, 5.48888889,
       5.09876543, 6.30882353, 5.98019802, 5.48888889, 6.24137931,
       5.35714286, 5.09876543, 6.27777778, 5.29411765, 5.675     ,
       5.48888889, 6.30882353, 5.675     , 5.64864865, 5.48888889,
       4.94285714, 5.48888889, 5.98019802, 6.94545455, 5.48888889,
       4.94285714, 5.98019802, 5.48888889, 5.64864865, 5.48888889,
       5.48888889, 6.30882353, 5.09876543, 5.09876543, 6.43478261,
       5.09876543, 6.92307692, 5.48888889, 6.94545455, 5.29411765,
       6.24137931, 4.94285714, 5.675     , 5.64864865, 6.05882

In [44]:
# Feature Extraction -> choosing best no. of features depending on the Gini Impurity(classification), MSE, MAE -> (Regression)

In [45]:
# Training and Testing
# from sklearn.model_selection import train_test_split()
# X_train, y_train, X_test, y_test = train_test_split(X, y, test_size = 0.25 )

In [55]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(reg, X, y, cv = 3)
print(cv_score)

[0.49718574 0.48592871 0.43714822]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
is_clf = True
for depth in range(3, 8):
    scores = cross_val_score(is_clf, X, y, depth, 5)
    mean = scores.mean()
    std_2 = 2 * scores.std()
    
print(depth, mean, std_2)

TypeError: cross_val_score() takes from 2 to 3 positional arguments but 5 were given

In [73]:
from sklearn import metrics
error = metrics.r2_score(X_test, reg_pred)
#error2 = metrics.mean_squared_error(X_test, reg_pred)

ValueError: y_true and y_pred have different number of output (11!=1)

In [74]:
# For Classification Error Calculation -> use accuracy_score
# For Regression Error Calculation -> use mse, mae, r2_score

In [81]:
reg = linear_model.BayesianRidge()
from sklearn.model_selection import GridSearchCV
params = {
  'alpha_1':[0.1,0.2,0.3],
  'alpha_2':[0.1,0.2,0.3]
}
reg_cv = GridSearchCV(reg, params, cv=5)
# predefined trairing and test sets
reg_cv.fit( X_train, y_train)
print(reg_cv.best_params_)

{'alpha_1': 0.3, 'alpha_2': 0.1}


# CLUSTERING

In [88]:
# Measuring Similarity -> cosine similarity -> dot product of L2 norm of two vectors, [-1, 1] ,, 0 -> no similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(data)

array([[1.        , 0.98479324, 0.99225115, ..., 0.95224146, 0.94713022,
        0.99291264],
       [0.98479324, 1.        , 0.99514183, ..., 0.9609342 , 0.9634775 ,
        0.99105198],
       [0.99225115, 0.99514183, 1.        , ..., 0.93997629, 0.93982573,
        0.98822294],
       ...,
       [0.95224146, 0.9609342 , 0.93997629, ..., 1.        , 0.99886991,
        0.97684733],
       [0.94713022, 0.9634775 , 0.93982573, ..., 0.99886991, 1.        ,
        0.9739885 ],
       [0.99291264, 0.99105198, 0.98822294, ..., 0.97684733, 0.9739885 ,
        1.        ]])

In [105]:
from sklearn.neighbors import NearestNeighbors  # returns distance of all neighbors from the input data observation
nbrs = NearestNeighbors(n_neighbors = 6) # default-val = 5
nbrs.fit(data)
dists, knbrs = nbrs.kneighbors(data)  # ~= predict function with other algorithms
knbrs

array([[   4,    0,  123,  262, 1380, 1379],
       [   1,  752, 1314, 1174, 1173, 1357],
       [   2,  196,  224,   58, 1361,   63],
       ...,
       [1592, 1596, 1542, 1593, 1565,  897],
       [1597, 1594, 1046,  670, 1273,  877],
       [1598,  571,  569,  995, 1335,   85]], dtype=int64)

In [143]:
# K-Means Clustering -> iterative approach , produces spherical clusters
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3)
var2 = kmeans.fit(data)

# checking the cluster assignments 
kmeans.labels_  # cluster assignment for each data observation
kmeans.cluster_centers_  # final centroids
kmeans.predict(data)  # assigns random new clusters to observations 

array([0, 2, 2, ..., 2, 2, 2])

In [142]:
# Mini Batch kmeans clustering
from sklearn.cluster import MiniBatchKMeans
mini = MiniBatchKMeans(batch_size = 10, n_clusters = 3)
mini.fit(data)
mini.labels_  
mini.cluster_centers_  
mini.predict(data)



array([0, 1, 1, ..., 0, 0, 0])

In [152]:
# Hirerchical Clustering -> bottom-up (divisive), top-down (agglomerative) (like merge sort)
# Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters = 5)
agg.fit(data)
agg.labels_
# agg.cluster_centers_ -> don't contain this bcz it don't make use of any centroids

array([1, 2, 1, ..., 1, 2, 1], dtype=int64)