# Importing necessary libraries

In [98]:
import time
import sys
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_selection import RFE
import numpy as np

<h1><b><i>First Task</i></b></h1>

For this dataset, we built two classifiers to predict Churn: one using Adaboost and one using random forest. <br>Used a suitable evaluation metric to compare the performance of the classifiers.

# Reading the dataset

In [99]:
df1 = pd.read_csv("customer-churn-data.csv")
df1.head()

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,1,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True
1,2,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,4,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False


# Data Preprocessing
<b>
   <h4>
      This step includes the following:
   </h4>
</b>
<ol>
   <li>Encoding of Gender and PromotionResponse column</li>
   <li>Dropping irrelevant columns like CustomerID  etc.</li>
</ol>

In [100]:
Gender_new = pd.get_dummies(df1['Gender'], drop_first=True)
PromotionResponse_new = pd.get_dummies(df1['PromotionResponse'], drop_first=True)

df1 = pd.concat([df1, Gender_new, PromotionResponse_new], axis=1)
df1.head()

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn,Male,Other,Responded,Unsubscribed
0,1,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True,0,1,1,0
1,2,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False,1,0,1,0
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True,1,0,1,0
3,4,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True,0,1,0,0
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False,0,1,0,1


In [101]:
X = df1.drop(['CustomerID', 'Gender', 'PromotionResponse', 'Churn'] , axis=1)
y = df1['Churn']

# AdaBoost Classifier
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li>Used hyperparameter tuning</li>
   <li>Fitted all the models using cross-validation and returned the best one</li>
</ol>

In [102]:
start_time = time.time()

parameters = {
    'n_estimators': range(10,101,10),
    'learning_rate': [(0.97 + x / 100) for x in range(0, 8)],
    'algorithm': ['SAMME', 'SAMME.R']
}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=28)

best_result = {
    "test_accuracy": -1,
    "n_estimators": -1,
    "learning_rate": -1,
    "algorithm": "",
}

for n_estimators in parameters['n_estimators']:
  for learning_rate in parameters['learning_rate']:
    for algorithm in parameters['algorithm']:

      model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, random_state=42)

      scores = cross_val_score(model, X, y, cv=kf)
      test_accuracy = scores.mean()

      if test_accuracy > best_result['test_accuracy']:
        best_result['test_accuracy'] = test_accuracy
        best_result['n_estimators'] = n_estimators
        best_result['learning_rate'] = learning_rate
        best_result['algorithm'] = algorithm

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.532, 'n_estimators': 60, 'learning_rate': 1.01, 'algorithm': 'SAMME.R'}
Model size in bytes: 48
Model fit time: 122.06 seconds


# RandomForest Classifier
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li> Used hyperparameter tuning</li>
   <li> Fitted all the models using cross-validation and returned the best one</li>
   <li> Also used out of bag error estimate</li>
</ol>

Used cross validation

In [103]:
start_time = time.time()

parameters = {
	'n_estimators': [15, 25, 35, 45, 55, 65, 75, 85, 95, 105],
	'max_features': ['sqrt', 'log2', None],
	'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
	'min_samples_split': [2, 4, 6],
}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=28)

best_result = {
    "test_accuracy": -1,
    "n_estimators": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
}

for n_estimators in parameters['n_estimators']:
  for max_features in parameters['max_features']:
    for max_depth in parameters['max_depth']:
      for min_samples_split in parameters['min_samples_split']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)

        scores = cross_val_score(model, X, y, cv=kf)
        test_accuracy = scores.mean()

        if test_accuracy > best_result['test_accuracy']:
          best_result['test_accuracy'] = test_accuracy
          best_result['n_estimators'] = n_estimators
          best_result['max_features'] = max_features
          best_result['max_depth'] = max_depth
          best_result['min_samples_split'] = min_samples_split

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.533, 'n_estimators': 15, 'max_features': 'sqrt', 'max_depth': 14, 'min_samples_split': 2}
Model size in bytes: 48
Model fit time: 742.21 seconds


Used out of bag error estimate

In [104]:
start_time = time.time()

parameters = {
	'n_estimators': [25, 35, 45, 55, 65, 75, 85, 95, 105],
	'max_features': ['sqrt', 'log2', None],
	'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
	'min_samples_split': [2, 4, 6],
}

best_result = {
    "test_accuracy": -1,
    "n_estimators": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
}

for n_estimators in parameters['n_estimators']:
   for max_features in parameters['max_features']:
    for max_depth in parameters['max_depth']:
      for min_samples_split in parameters['min_samples_split']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42, oob_score=True)

        model.fit(X,y)
        test_accuracy = model.oob_score_

        if test_accuracy > best_result['test_accuracy']:
          best_result['test_accuracy'] = test_accuracy
          best_result['n_estimators'] = n_estimators
          best_result['max_features'] = max_features
          best_result['max_depth'] = max_depth
          best_result['min_samples_split'] = min_samples_split

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.562, 'n_estimators': 25, 'max_features': None, 'max_depth': 6, 'min_samples_split': 2}
Model size in bytes: 48
Model fit time: 177.09 seconds


<h1><b><i>Second Task</i></b></h1>

We did the following:
<ol>
  <li>
    Built two classifiers to predict Gender, one using a decision tree and one using a random forest.
  </li>
  <li>
    Built two models to predict Rating, one using linear regression and one using a decision tree regressor.
  </li>
</ol>

# Reading the dataset

In [None]:
df2 = pd.read_csv("supermarket-sales-data.csv")
df2.head()

# Data Preprocessing
<b>
   <h4>
      This step includes the following:
   </h4>
</b>
<ol>
   <li>Encoding of Gender, Branch etc. columns</li>
   <li>Dropping irrelevant columns</li>
</ol>

In [106]:
Branch_new = pd.get_dummies(df2['Branch'], drop_first=True)
CustomerType_new = pd.get_dummies(df2['CustomerType'], drop_first=True)
Gender_new = pd.get_dummies(df2['Gender'], drop_first=True)
ProductType_new = pd.get_dummies(df2['ProductType'], drop_first=True)
PaymentType_new = pd.get_dummies(df2['PaymentType'], drop_first=True)

df2 = pd.concat([df2, Branch_new, CustomerType_new, Gender_new, ProductType_new, PaymentType_new], axis =1)
df2.head()

Unnamed: 0,InvoiceID,Branch,CustomerType,Gender,ProductType,UnitPrice,Quantity,Tax,Total,PaymentType,...,C,Normal,Male,Fashion accessories,Food and beverages,Health and beauty,Home and lifestyle,Sports and travel,Credit card,UPI
0,750-67-8428,A,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,UPI,...,0,0,0,0,0,1,0,0,0,1
1,226-31-3081,C,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Cash,...,1,1,0,0,0,0,0,0,0,0
2,631-41-3108,A,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,...,0,1,1,0,0,0,1,0,1,0
3,123-19-1176,A,Member,Male,Health and beauty,58.22,8,23.288,489.048,UPI,...,0,0,1,0,0,1,0,0,0,1
4,373-73-7910,A,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,UPI,...,0,1,1,0,0,0,0,1,0,1


In [107]:
df2.corr()

  df2.corr()


Unnamed: 0,UnitPrice,Quantity,Tax,Total,Rating,B,C,Normal,Male,Fashion accessories,Food and beverages,Health and beauty,Home and lifestyle,Sports and travel,Credit card,UPI
UnitPrice,1.0,0.010778,0.633962,0.633962,-0.008778,-0.000342,0.024717,-0.020238,0.015445,0.026034,0.005836,-0.013072,-0.005854,0.022257,-0.032415,0.001865
Quantity,0.010778,1.0,0.70551,0.70551,-0.015815,-0.006773,0.017291,-0.016763,-0.074258,-0.070485,-0.006084,0.015709,0.027446,0.004912,0.006203,-0.006443
Tax,0.633962,0.70551,1.0,1.0,-0.036442,-0.008876,0.040176,-0.01967,-0.049451,-0.03385,-0.000551,0.001165,0.024276,0.016517,0.002852,-0.012244
Total,0.633962,0.70551,1.0,1.0,-0.036442,-0.008876,0.040176,-0.01967,-0.049451,-0.03385,-0.000551,0.001165,0.024276,0.016517,0.002852,-0.012244
Rating,-0.008778,-0.015815,-0.036442,-0.036442,1.0,-0.063462,0.04074,0.018889,0.0048,0.01531,0.037546,0.00754,-0.034351,-0.014658,0.011935,-0.010509
B,-0.000342,-0.006773,-0.008876,-0.008876,-0.063462,1.0,-0.49253,0.005657,0.018398,0.016121,-0.04351,0.014999,-0.018072,0.03931,0.026368,-0.006879
C,0.024717,0.017291,0.040176,0.040176,0.04074,-0.49253,1.0,-0.019903,-0.058243,0.036841,0.050161,0.01272,-0.043459,-0.054086,-0.018442,-0.032082
Normal,-0.020238,-0.016763,-0.01967,-0.01967,0.018889,0.005657,-0.019903,1.0,0.039996,0.016616,-0.036011,0.017559,-0.015494,-0.020608,-0.069946,0.049835
Male,0.015445,-0.074258,-0.049451,-0.049451,0.0048,0.018398,-0.058243,0.039996,1.0,-0.035669,-0.014909,0.067695,0.006328,-0.025984,-0.031061,0.054042
Fashion accessories,0.026034,-0.070485,-0.03385,-0.03385,0.01531,0.016121,0.036841,0.016616,-0.035669,1.0,-0.213579,-0.197014,-0.203093,-0.207608,0.003626,0.019743


In [108]:
X1 = df2.drop(['InvoiceID' ,'Tax' ,'Branch', 'CustomerType', 'Gender', 'ProductType', 'PaymentType', 'Male'] , axis=1)
y1 = df2['Gender']

In [109]:
X2 = df2.drop(['InvoiceID' ,'Tax','Branch', 'CustomerType', 'Gender', 'ProductType', 'PaymentType','Rating'] , axis=1)
y2 = df2['Rating']

# Decision Tree Classifier for Gender
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li> Used hyperparameter tuning</li>
   <li> Fitted all the models using cross-validation and returned the best one</li>
</ol>

In [110]:
start_time = time.time()

parameters = {
    'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
    'min_samples_split': [2, 4, 6],
    'max_features': ['sqrt', 'log2', None],
}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=21)

best_result = {
    "test_accuracy": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
}

for max_features in parameters['max_features']:
  for max_depth in parameters['max_depth']:
    for min_samples_split in parameters['min_samples_split']:
      model = DecisionTreeClassifier(max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)

      scores = cross_val_score(model, X1, y1, cv=kf)
      test_accuracy = scores.mean()

      if test_accuracy > best_result['test_accuracy']:
        best_result['test_accuracy'] = test_accuracy
        best_result['max_features'] = max_features
        best_result['max_depth'] = max_depth
        best_result['min_samples_split'] = min_samples_split

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.5389999999999999, 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 2}
Model size in bytes: 48
Model fit time: 3.61 seconds


# Random Forest Classifier for Gender
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li> Used hyperparameter tuning</li>
   <li> Fitted all the models using cross-validation and returned the best one</li>
   <li> Also used out of bag error estimate</li>
</ol>

Used cross validation

In [111]:
start_time = time.time()

parameters = {
	'n_estimators': [15, 25, 35, 45, 55, 65, 75, 85, 95, 105],
	'max_features': ['sqrt', 'log2', None],
	'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
	'min_samples_split': [2, 4, 6],
}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=21)

best_result = {
    "test_accuracy": -1,
    "n_estimators": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
}

for n_estimators in parameters['n_estimators']:
   for max_features in parameters['max_features']:
    for max_depth in parameters['max_depth']:
      for min_samples_split in parameters['min_samples_split']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)

        scores = cross_val_score(model, X1, y1, cv=kf)
        test_accuracy = scores.mean()

        if test_accuracy > best_result['test_accuracy']:
          best_result['test_accuracy'] = test_accuracy
          best_result['n_estimators'] = n_estimators
          best_result['max_features'] = max_features
          best_result['max_depth'] = max_depth
          best_result['min_samples_split'] = min_samples_split

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.5349999999999999, 'n_estimators': 25, 'max_features': None, 'max_depth': 14, 'min_samples_split': 4}
Model size in bytes: 48
Model fit time: 599.80 seconds


Used out of error bag estimate

In [112]:
start_time = time.time()

parameters = {
	'n_estimators': [25, 35, 45, 55, 65, 75, 85, 95, 105],
	'max_features': ['sqrt', 'log2', None],
	'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
	'min_samples_split': [2, 4, 6],
}

best_result = {
    "test_accuracy": -1,
    "n_estimators": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
}

for n_estimators in parameters['n_estimators']:
   for max_features in parameters['max_features']:
    for max_depth in parameters['max_depth']:
      for min_samples_split in parameters['min_samples_split']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42, oob_score=True)

        model.fit(X1,y1)
        test_accuracy = model.oob_score_

        if test_accuracy > best_result['test_accuracy']:
          best_result['test_accuracy'] = test_accuracy
          best_result['n_estimators'] = n_estimators
          best_result['max_features'] = max_features
          best_result['max_depth'] = max_depth
          best_result['min_samples_split'] = min_samples_split

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.554, 'n_estimators': 85, 'max_features': None, 'max_depth': None, 'min_samples_split': 2}
Model size in bytes: 48
Model fit time: 143.56 seconds


# Decision Tree Regressor for Rating
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li> Used Hyperparameter Training</li>
   <li> Also used Recursive Feature Elimination</li>
   <li> Fitted all the models using cross-validation and returned the best one</li>
</ol>

In [113]:
start_time = time.time()

parameters = {
    'max_depth': [None, 2, 4, 6, 8, 10, 12, 14],
    "max_features": [
        None,
        "sqrt",
        "log2",
    ],
    'min_samples_split': [2, 4, 6, 8]
}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=28)

best_result = {
    "test_accuracy": -1,
    "max_features": "",
    "max_depth": -1,
    "min_samples_split": -1,
    "n_features_to_select": -1,
}

for max_features in parameters["max_features"]:
    for max_depth in parameters["max_depth"]:
        for min_samples_split in parameters["min_samples_split"]:
            for n_features_to_select in range(1, 16):
                model = DecisionTreeRegressor(
                    max_features=max_features,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    random_state=69,
                )
                rfe = RFE(
                    estimator=model,
                    n_features_to_select=n_features_to_select,
                    step=1,
                )
                rfe.fit(X2, y2)

                selected_features = rfe.support_

                important_features = []
                for feature in rfe.support_.nonzero():
                    important_features.append(X2.columns[feature])

                imp = []
                for feature in important_features[0]:
                    imp.append(feature)

                scores = cross_val_score(model, X2[imp], y2, cv=kf)
                test_accuracy = scores.max()

                if test_accuracy > best_result["test_accuracy"]:
                    best_result["test_accuracy"] = test_accuracy
                    best_result["max_features"] = max_features
                    best_result["n_features_to_select"] = n_features_to_select
                    best_result["max_depth"] = max_depth
                    best_result["min_samples_split"] = min_samples_split


end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.030598649613079343, 'max_features': 'sqrt', 'max_depth': 4, 'min_samples_split': 4, 'n_features_to_select': 3}
Model size in bytes: 48
Model fit time: 74.24 seconds


# Linear Regressor for Rating
<b>
   <h4>
      We have done the following:
   </h4>
</b>
<ol>
   <li> Used Hyperparameter Training</li>
   <li> Also used Recursive Feature Elimination</li>
   <li> Fitted all the models using cross-validation and returned the best one</li>
</ol>

In [114]:
start_time = time.time()

parameters = {"fit_intercept": [True, False]}

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=28)

best_result = {
    "test_accuracy": -1,
    "fit_intercept": -1,
    "n_features_to_select": -1,
}

for fit_intercept in parameters["fit_intercept"]:
    for n_features_to_select in range(1, 16):
        model = LinearRegression(
            fit_intercept=fit_intercept,
        )
        rfe = RFE(
            estimator=model,
            n_features_to_select=n_features_to_select,
            step=1,
        )
        rfe.fit(X2, y2)

        selected_features = rfe.support_

        important_features = []
        for feature in rfe.support_.nonzero():
            important_features.append(X2.columns[feature])

        imp = []
        for feature in important_features[0]:
            imp.append(feature)

        scores = cross_val_score(model, X2[imp], y2, cv=kf)
        test_accuracy = scores.max()

        if test_accuracy > best_result["test_accuracy"]:
            best_result["test_accuracy"] = test_accuracy
            best_result["n_features_to_select"] = n_features_to_select
            best_result["fit_intercept"] = fit_intercept

end_time = time.time()

print(best_result)
print("Model size in bytes: " + str(sys.getsizeof(model)))
print(f"Model fit time: {end_time - start_time:.2f} seconds")

{'test_accuracy': 0.008848701011569093, 'fit_intercept': True, 'n_features_to_select': 1}
Model size in bytes: 48
Model fit time: 1.64 seconds
