# SetUp

In [1]:
import sys
assert sys.version_info>=(3,7)
import pandas as pd

In [2]:
from packaging import version
import sklearn
assert version.parse(sklearn.__version__)>= version.parse("1.0.1")

In [3]:
import matplotlib.pyplot as plt
plt.rc('font',size=14)
plt.rc('axes',labelsize=14,titlesize=14)
plt.rc('legend',fontsize=14)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)

In [4]:
from pathlib import Path
IMAGES_PATH= Path() / "images" / "ensembles"
IMAGES_PATH.mkdir(parents=True,exist_ok=True)
def save_fig(fig_id,tight_layout=True,fig_extension="png",resolution=300):
    path=IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension,dpi=resolution)    
        
    


# Task 1: Customer Churn

# Description of the data set:
    The customer churn dataset dataset captures customer interactions with an online retail store.
    There are 15 atttributes in the data . Some of them are categorical and some of the are numerical
    attributes.Using these attributes we have built two classifier, to predict wheather a customer will churn
    or not with a particular business. So clearly we have used the churn attribute as the target variable and all
    other attributes as Input attributes(or variable).

## The data set

In [5]:
data1 = pd.read_csv("customer-churn-data.csv") 
data1

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,1,62,Other,45.15,5892.58,5,22,453.80,2,0,3,129,True,Responded,True
1,2,65,Male,79.51,9025.47,13,77,22.90,2,2,3,227,False,Responded,False
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,4,21,Other,79.63,9110.30,3,33,411.83,5,3,5,226,True,Ignored,True
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,54,Male,143.72,1089.09,2,29,77.75,0,3,2,88,True,Ignored,False
996,997,19,Male,164.19,3700.24,9,90,34.45,6,4,4,352,False,Responded,True
997,998,47,Female,113.31,705.85,17,69,187.37,7,3,1,172,True,Unsubscribed,False
998,999,23,Male,72.98,3891.60,7,31,483.80,1,2,5,55,False,Responded,True


In [6]:
from sklearn.preprocessing import LabelEncoder      

data1 = pd.read_csv("customer-churn-data.csv")     
data1=data1.drop(columns=["CustomerID"]) 
categorical_cols = ["Gender", "EmailOptIn", "PromotionResponse"]

label_encoder = LabelEncoder()
for col in categorical_cols:
    data1[col] = label_encoder.fit_transform(data1[col])


### modified data1


In [7]:
data1

Unnamed: 0,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,62,2,45.15,5892.58,5,22,453.80,2,0,3,129,1,1,True
1,65,1,79.51,9025.47,13,77,22.90,2,2,3,227,0,1,False
2,18,1,29.19,618.83,13,71,50.53,5,2,2,283,0,1,True
3,21,2,79.63,9110.30,3,33,411.83,5,3,5,226,1,0,True
4,21,2,77.66,5390.88,15,43,101.19,3,0,5,242,0,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,54,1,143.72,1089.09,2,29,77.75,0,3,2,88,1,0,False
996,19,1,164.19,3700.24,9,90,34.45,6,4,4,352,0,1,True
997,47,0,113.31,705.85,17,69,187.37,7,3,1,172,1,2,False
998,23,1,72.98,3891.60,7,31,483.80,1,2,5,55,0,1,True


In [8]:
from sklearn.model_selection import train_test_split
X1=data1.drop(columns=['Churn'])
y1=data1['Churn']
X1_train,X1_test,y1_train,y1_test=train_test_split(X1,y1,test_size=0.2,random_state=42)

## Random forest Classifier for the Customer Churn Data

In [9]:
import time
import sys
from sklearn.ensemble import RandomForestClassifier
start_time_randomforest=time.time()
rnd_clf=RandomForestClassifier(n_estimators=6000,max_leaf_nodes=10,n_jobs=-1,random_state=42)
rnd_clf.fit(X1_train,y1_train)
y1_pred_rf=rnd_clf.predict(X1_test)
time_needed_randomforest=time.time()-start_time_randomforest
space_needed_randomforest=sys.getsizeof(rnd_clf)
print("Time needed(time complexity) to train the classifier",time_needed_randomforest)
print("space used to train the classifier",space_needed_randomforest)


Time needed(time complexity) to train the classifier 23.21956968307495
space used to train the classifier 56


## Adaboost  classifier for the Customer Churn Data

In [10]:
from sklearn.ensemble import AdaBoostClassifier
start_time_adaboost=time.time()
adaboost_clf=AdaBoostClassifier(n_estimators=500,random_state=42)
adaboost_clf.fit(X1_train,y1_train)
y1_pred_adaboost=adaboost_clf.predict(X1_test)
time_needed_adaboost=time.time()-start_time_adaboost
space_needed_adaboost=sys.getsizeof(adaboost_clf)
print("Time needed for the Adaboost classifier",time_needed_adaboost)
print("Space needed for the adaboost classifier",space_needed_adaboost)

Time needed for the Adaboost classifier 3.4428281784057617
Space needed for the adaboost classifier 56


## Accuracy For Both The Classifiers

In [11]:
from sklearn.metrics import accuracy_score
adaboost_accuracy=accuracy_score(y1_test,y1_pred_adaboost)
random_forest_accuracy=accuracy_score(y1_test,y1_pred_rf)

In [12]:
print("Accuracy on Adaboost Classifier:", adaboost_accuracy)
print("Accuracy on Random Forest Classifier:",random_forest_accuracy)

Accuracy on Adaboost Classifier: 0.54
Accuracy on Random Forest Classifier: 0.56


## Conclusion

# Task 2: Supermarket Sales Data

## The data set

In [13]:
data2=pd.read_csv("supermarket-sales-data .csv")
data2

Unnamed: 0,InvoiceID,Branch,CustomerType,Gender,ProductType,UnitPrice,Quantity,Tax,Total,PaymentType,Rating
0,750-67-8428,A,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,UPI,9.1
1,226-31-3081,C,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,Cash,9.6
2,631-41-3108,A,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,7.4
3,123-19-1176,A,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,UPI,8.4
4,373-73-7910,A,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,UPI,5.3
...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,UPI,6.2
996,303-96-2227,B,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,UPI,4.4
997,727-02-1313,A,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,Cash,7.7
998,347-56-2442,A,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,Cash,4.1


In [14]:
from sklearn.preprocessing import LabelEncoder

data2 = pd.read_csv("supermarket-sales-data .csv")
data2=data2.drop(columns=["InvoiceID"])

categorical_cols = ["Branch","CustomerType","ProductType","PaymentType"]

label_encoder = LabelEncoder()
for col in categorical_cols:
    data2[col] = label_encoder.fit_transform(data2[col])

## Modified Data

In [15]:
data2

Unnamed: 0,Branch,CustomerType,Gender,ProductType,UnitPrice,Quantity,Tax,Total,PaymentType,Rating
0,0,0,Female,3,74.69,7,26.1415,548.9715,2,9.1
1,2,1,Female,0,15.28,5,3.8200,80.2200,0,9.6
2,0,1,Male,4,46.33,7,16.2155,340.5255,1,7.4
3,0,0,Male,3,58.22,8,23.2880,489.0480,2,8.4
4,0,1,Male,5,86.31,7,30.2085,634.3785,2,5.3
...,...,...,...,...,...,...,...,...,...,...
995,2,1,Male,3,40.35,1,2.0175,42.3675,2,6.2
996,1,1,Female,4,97.38,10,48.6900,1022.4900,2,4.4
997,0,0,Male,2,31.84,1,1.5920,33.4320,0,7.7
998,0,1,Male,4,65.82,1,3.2910,69.1110,0,4.1


## Decision-Tree classifier to predict Gender

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


X2 = data2.drop(columns=["Gender"])
y2 = data2["Gender"] 


X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

start_time=time.time()
dt_classifier = DecisionTreeClassifier(max_leaf_nodes=10)
dt_classifier.fit(X2_train, y2_train)
dt_pred = dt_classifier.predict(X2_test)
time_decision_tree=time.time()-start_time
space_needed_decisiontree=sys.getsizeof(dt_classifier)
print("Time required for the decision-tree classifier",time_decision_tree)
print("Space required for the decision tree classifier",space_needed_decisiontree)

Time required for the decision-tree classifier 0.019819974899291992
Space required for the decision tree classifier 56


# Random-Forest Classifier to Predict Gender

In [17]:
rf_classifier2 = RandomForestClassifier(n_estimators=6000,max_leaf_nodes=10,n_jobs=-1,random_state=42)
start_time=time.time()
rf_classifier2.fit(X2_train, y2_train)
rf2_pred=rf_classifier2.predict(X2_test)
time_rf2=time.time()-start_time
space_needed_rf2=sys.getsizeof(rf_classifier2)
print("Time required for the random forest classifier",time_rf2)
print("Space required for the random forest classifier",space_needed_rf2)

Time required for the random forest classifier 21.253355741500854
Space required for the random forest classifier 56


# Accuracy of both the Classifier

In [18]:
rf2_accuracy = accuracy_score(y2_test, rf2_pred)
print("Random Forest Classifier Accuracy:", rf2_accuracy)
dt_accuracy = accuracy_score(y2_test, dt_pred)
print("Decision Tree Classifier Accuracy:", dt_accuracy)

Random Forest Classifier Accuracy: 0.525
Decision Tree Classifier Accuracy: 0.5


# Overall Conclusion

# Rating Prediction From The Supermarket-sales data set

In [19]:
data3=pd.read_csv("supermarket-sales-data .csv")
data3

Unnamed: 0,InvoiceID,Branch,CustomerType,Gender,ProductType,UnitPrice,Quantity,Tax,Total,PaymentType,Rating
0,750-67-8428,A,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,UPI,9.1
1,226-31-3081,C,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,Cash,9.6
2,631-41-3108,A,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,7.4
3,123-19-1176,A,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,UPI,8.4
4,373-73-7910,A,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,UPI,5.3
...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,UPI,6.2
996,303-96-2227,B,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,UPI,4.4
997,727-02-1313,A,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,Cash,7.7
998,347-56-2442,A,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,Cash,4.1


In [20]:
data3=data3.drop(columns=["InvoiceID"])

categorical_cols = ["Branch","CustomerType","ProductType","PaymentType","Gender"]

label_encoder = LabelEncoder()
for col in categorical_cols:
    data3[col] = label_encoder.fit_transform(data3[col])

# Modified data3

In [21]:
data3

Unnamed: 0,Branch,CustomerType,Gender,ProductType,UnitPrice,Quantity,Tax,Total,PaymentType,Rating
0,0,0,0,3,74.69,7,26.1415,548.9715,2,9.1
1,2,1,0,0,15.28,5,3.8200,80.2200,0,9.6
2,0,1,1,4,46.33,7,16.2155,340.5255,1,7.4
3,0,0,1,3,58.22,8,23.2880,489.0480,2,8.4
4,0,1,1,5,86.31,7,30.2085,634.3785,2,5.3
...,...,...,...,...,...,...,...,...,...,...
995,2,1,1,3,40.35,1,2.0175,42.3675,2,6.2
996,1,1,0,4,97.38,10,48.6900,1022.4900,2,4.4
997,0,0,1,2,31.84,1,1.5920,33.4320,0,7.7
998,0,1,1,4,65.82,1,3.2910,69.1110,0,4.1


# Linear-Regression Model to Predict Rating

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X3=data3.drop(columns=['Rating'])
y3=data3['Rating']
X3_train,X3_test,y3_train,y3_test=train_test_split(X3,y3,test_size=0.2,random_state=42)
lin_reg_model=LinearRegression()
start_time_linearregression=time.time()
lin_reg_model.fit(X3_train,y3_train)
y3_pred_linear_reg=lin_reg_model.predict(X3_test)
linear_reg_mse=mean_squared_error(y3_test,y3_pred_linear_reg)
time_linearregression=time.time()-start_time_linearregression
space_linearregression=sys.getsizeof(lin_reg_model)
print("Time required to train the linear regression model",time_linearregression)
print("Space required for the linear_regression model",space_linearregression)


Time required to train the linear regression model 0.0756082534790039
Space required for the linear_regression model 56


# Decision-Tree regressor to predict Rating

In [23]:
from sklearn.tree import DecisionTreeRegressor
decision_tree_model2=DecisionTreeRegressor()
start_time_decisiontree2=time.time()
decision_tree_model2.fit(X3_train,y3_train)
y3_pred_decision_tree2=decision_tree_model2.predict(X3_test)
time_decisiontree2=time.time()-start_time_decisiontree2
space_decisiontree2=sys.getsizeof(decision_tree_model2)
print("Time required for the decision-tree model",time_decisiontree2)
print("Space required for the decision-tree model",space_decisiontree2)

Time required for the decision-tree model 0.037895917892456055
Space required for the decision-tree model 56


# Acuuracy of both the Classifiers

In [24]:
decision_tree2_mse=mean_squared_error(y3_test,y3_pred_decision_tree2)
linear_reg_mse=mean_squared_error(y3_test,y3_pred_linear_reg)
print("Mean squared error of the decision tree regressor(CART) is:",decision_tree2_mse)
print("Mean Squared error of the linear regressor is:",linear_reg_mse)

Mean squared error of the decision tree regressor(CART) is: 6.59595
Mean Squared error of the linear regressor is: 3.1170468475043425


# Overall Conclusion