# **Project: Customer Segmentation and Personality Analysis**

## **IMPORT THE LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from datetime import date
from datetime import datetime
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing


# **EDA**
#Read and Pre-process the data

In [None]:
# Load the raw marketing-campaign workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the notebook on another machine.
workbook_path = 'C:\\Users\\DELL\\Downloads\\marketing_campaign1.xlsx'
data = pd.read_excel(workbook_path)

In [None]:
data.head()


In [None]:
data.columns

## Data Dimensions

In [None]:
data.shape

## Non-Null Count and Data Types

In [None]:
data.info()

The `Income` column contains missing values.

The `Year_Birth` column is stored as an integer; it is converted to datetime format below.

## Converting column Year_Birth to DateTime format

In [None]:
# Parse the four-digit birth year into a datetime so the `.dt` accessor
# can be used on this column later.
birth_year_raw = data['Year_Birth']
data['Year_Birth'] = pd.to_datetime(birth_year_raw, format='%Y')

##Missing values

In [None]:
data.isnull().sum()

### 24 missing values in the column `Income`

##Checking for skewness

In [None]:
data['Income'].skew()

### Since the `Income` column is skewed, the missing values are replaced with the median

##Imputing the missing values

In [None]:
# Impute missing incomes with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data['Income']` selection: that chained
# in-place form is deprecated and does not update `data` when pandas
# copy-on-write is active.
income_median = data['Income'].median()
data['Income'] = data['Income'].fillna(income_median)

In [None]:
data.info()

##Unique Values

In [None]:
data.nunique()

##Value Counts

In [None]:
# Print the frequency table of each low-cardinality column, followed by a blank line.
columns = ['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'Complain']
for col in columns:
    frequencies = data[col].value_counts()
    print(f"Value counts for column '{col}':", frequencies, "", sep="\n")

##Duplicated Rows

In [None]:
data.duplicated().sum()

#Data Understanding

##Summary Statistics

In [None]:
data.describe()

##Feature Transformation

### Creating new columns such as Age, Year joined, Month joined, etc.

In [None]:
from datetime import date, datetime

# Reference year for the "years elapsed" features, taken from the system clock,
# so these values depend on when the notebook is run.
current_year = datetime.now().year
enrolment_date = data['Dt_Customer']

# Customer age and tenure in whole calendar years.
data['Age'] = current_year - data["Year_Birth"].dt.year
data['Total_yearCust'] = current_year - enrolment_date.dt.year
# Year and full month name of the enrolment date.
data['Year_Joined'] = enrolment_date.dt.year
data['Month_Joined'] = enrolment_date.dt.strftime("%B")

In [None]:
# Row-wise totals. skipna=False keeps the semantics of chained `+`
# (a missing component makes the total missing).
spend_columns = ['MntWines', 'MntFruits', 'MntMeatProducts',
                 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                    'AcceptedCmp4', 'AcceptedCmp5', 'Response']
purchase_columns = ['NumWebPurchases', 'NumCatalogPurchases',
                    'NumStorePurchases', 'NumDealsPurchases']

# Total amount spent across all product categories
data['Total_Expenses'] = data[spend_columns].sum(axis=1, skipna=False)
# Number of accepted campaigns (the five campaigns plus the `Response` column)
data['Total_Acc_Cmp'] = data[campaign_columns].sum(axis=1, skipna=False)
# Total number of purchases over all channels
data['TotalNumPurchases'] = data[purchase_columns].sum(axis=1, skipna=False)
# Total number of children (kids + teens) in the household
data["children"] = data[["Kidhome", "Teenhome"]].sum(axis=1, skipna=False)

##Correlation Analysis

In [None]:
data[['Income', 'Total_Expenses', 'Total_Acc_Cmp',"TotalNumPurchases",'Recency']].corr()

### Income has a strong positive correlation with both Total_Expenses and TotalNumPurchases.

In [None]:
data.head()

## Dropping Unwanted Columns

In [None]:
# Build the working frame `data2` without the identifier, raw date and
# Z_* columns; `data` itself keeps every column.
unwanted_columns = ['ID', 'Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue']
data2 = data.drop(columns=unwanted_columns)

In [None]:
data2.head()

In [None]:
data2.columns

# Data visualization

In [None]:
# Horizontal bar chart of the number of customers at each age.
plt.figure(figsize=(12, 12))
age_frequencies = data2['Age'].value_counts()
age_frequencies = age_frequencies.sort_index(ascending=False)
age_frequencies.plot(kind='barh')
plt.title('Age')

###We can see from the age graph that most of the customers are from the range of 43–60 years old.

In [None]:
# Income histogram in bins of 10,000 with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.set(style='whitegrid')
income_hist_options = dict(x='Income', binwidth=10000, kde=True)
ax = sns.histplot(data=data, **income_hist_options)
ax.set_title('Income')

### The above histogram shows how the data is distributed across the income bins.
### It lets us visually judge whether the data is positively skewed (tail to the right), negatively skewed (tail to the left), or approximately symmetric.
### Here the bulk of the incomes looks approximately symmetric; note, however, that the skewness computed earlier indicated a skewed column, which points to a few extreme values in the tail rather than a truly normal distribution.

In [None]:
# Histogram (50 bins) of total spending per customer with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.set(style='whitegrid')
expenses_hist_options = dict(x='Total_Expenses', bins=50, kde=True)
ax = sns.histplot(data=data2, **expenses_hist_options)
ax.set_title('Total_Expenses')

### Here we can see that Total_Expenses is positively skewed (tail towards the right side).

In [None]:
# Count plot of education levels, most frequent first, with the count
# written above each bar.
plt.figure(figsize=(10, 5))
sns.set(style='whitegrid')
education_order = data['Education'].value_counts().index
ax = sns.countplot(data=data, x='Education', saturation=1, alpha=0.9,
                   palette='deep', order=education_order)
ax.set_title('Education')
for bar in ax.patches:
    bar_height = bar.get_height()
    # int() works for numpy scalars and plain Python numbers alike, whereas
    # `.astype` exists only on numpy values.
    ax.annotate(str(int(bar_height)),
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='center', xytext=(0, 5),
                textcoords='offset points', color='black',
                fontweight='semibold', fontsize=9)

### Most of the people have completed their graduation, and only 54 people have just a basic education.

In [None]:
# Count plot of marital status, most frequent first, with the count
# written above each bar.
plt.figure(figsize=(10, 5))
sns.set(style='whitegrid')
marital_order = data['Marital_Status'].value_counts().index
ax = sns.countplot(data=data, x='Marital_Status', saturation=1, alpha=0.9,
                   palette='deep', order=marital_order)
ax.set_title('Marital_Status')
for bar in ax.patches:
    bar_height = bar.get_height()
    # int() works for numpy scalars and plain Python numbers alike, whereas
    # `.astype` exists only on numpy values.
    ax.annotate(str(int(bar_height)),
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='center', xytext=(0, 5),
                textcoords='offset points', color='black',
                fontweight='semibold', fontsize=9)

### Married and Together are the most common categories; each of the remaining categories has fewer than 500 customers.

In [None]:
# Total amount spent per product category, smallest first, with the value
# printed in white just inside the end of each bar.
plt.figure(figsize=(12, 7))
category_columns = ['MntWines', 'MntFruits', 'MntMeatProducts',
                    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
category_totals = data[category_columns].sum().sort_values(ascending=True)
ax = category_totals.plot(kind='barh')
plt.title('Expenses', pad=15, fontsize=18, fontweight='semibold')
for bar in ax.patches:
    bar_end = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.annotate(f'{bar_end}', (bar_end, bar_middle), xytext=(-49, 0),
                 textcoords='offset points', va='center', ha='left',
                 color='white', fontsize=11, fontweight='semibold')


###Most of the expenses are made on the purchase of wine, while the least are spent on fruits.

In [None]:
# Total number of purchases per channel, smallest first, with the value
# printed in white just inside the end of each bar.
plt.figure(figsize=(12, 7))
channel_columns = ['NumDealsPurchases', 'NumWebPurchases',
                   'NumCatalogPurchases', 'NumStorePurchases']
channel_totals = data[channel_columns].sum().sort_values(ascending=True)
ax = channel_totals.plot(kind='barh')
plt.title('Total Purchases', pad=15, fontsize=18, fontweight='semibold')
for bar in ax.patches:
    bar_end = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.annotate(f'{bar_end}', (bar_end, bar_middle), xytext=(-49, 0),
                 textcoords='offset points', va='center', ha='left',
                 color='white', fontsize=11, fontweight='semibold')


###About 12,970 purchases are made directly in stores, while 5,208 are made through deals.

In [None]:
# Summed spending per education level, highest first.
education_spend = data[['Education', 'Total_Expenses']].groupby('Education').sum()
education_spend = education_spend.sort_values(by='Total_Expenses', ascending=False)
ax = education_spend.plot(kind='bar', figsize=(10, 8), legend=None)
ax.set_title('Total Expenses by Education Level')
ax.set_ylabel('Total Expenses')

###Most of the expenses are made by households with graduates, while the least is made by households where individuals have only completed basic education.

In [None]:
# Count plot of the number of children per household, most frequent first,
# with the count written above each bar.
plt.figure(figsize=(10, 4))
sns.set(style='whitegrid')
# Take the bar order from the same frame that is plotted.
children_order = data2['children'].value_counts().index
ax = sns.countplot(data=data2, x='children', saturation=1, alpha=0.9,
                   palette='deep', order=children_order)
ax.set_title('Children')
for bar in ax.patches:
    bar_height = bar.get_height()
    # int() works for numpy scalars and plain Python numbers alike, whereas
    # `.astype` exists only on numpy values.
    ax.annotate(str(int(bar_height)),
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='center', xytext=(0, 5),
                textcoords='offset points', color='black',
                fontweight='semibold', fontsize=10)

### About 1128 households have exactly 1 child, about 638 households have no children, and about 53 households have 3 children.

In [None]:
# Pie chart of the share of households by number of children.
counts = data2['children'].value_counts()
slice_colors = sns.color_palette('deep', len(counts))
plt.figure(figsize=(10, 5))
plt.pie(counts, labels=counts.index, autopct='%1.1f%%',
        startangle=140, colors=slice_colors)
plt.title('Children')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Summed spending per number of children, highest first.
children_spend = data[['children', 'Total_Expenses']].groupby('children').sum()
children_spend = children_spend.sort_values(by='Total_Expenses', ascending=False)
cx = children_spend.plot(kind='bar', figsize=(10, 8), legend=None)
cx.set_ylabel('Total Expenses')

### Larger expenses are made by the households that have at least 1 child.

In [None]:
# Summed spending per marital status, highest first.
marital_spend = data[['Marital_Status', 'Total_Expenses']].groupby('Marital_Status').sum()
marital_spend = marital_spend.sort_values(by='Total_Expenses', ascending=False)
mx = marital_spend.plot(kind='bar', figsize=(10, 8), legend=None)
mx.set_title('Total Expenses made according to maritial status')
mx.set_ylabel('Total Expenses')

### Married and together people contribute most expenses.

In [None]:
# Annotated heatmap of the pairwise correlations between the key numeric features.
plt.figure(figsize=(10, 6))
heatmap_columns = ['Income', 'Total_Expenses', 'Total_Acc_Cmp', "TotalNumPurchases", 'Recency']
correlation_matrix = data[heatmap_columns].corr()
sns.heatmap(correlation_matrix, annot=True)

###The highest correlation is between income and total expenses, followed by total expenses and total accepted campaign.


##Outlier Detection

In [None]:
# Numeric columns that are box-plotted and then capped by the IQR rule below.
bx_col=['Income', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       "Age", 'Total_Expenses',"TotalNumPurchases"]

In [None]:
# One box plot per column of `bx_col`, laid out on a 6 x 3 subplot grid.
plt.figure(figsize=(8, 14))

grid_rows, grid_cols = 6, 3
for position, column_name in enumerate(bx_col, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    plt.boxplot(data2[column_name])
    plt.title(column_name)

plt.tight_layout()
plt.show()


In [None]:
# Cap outliers with the 1.5 * IQR rule: values above Q3 + 1.5*IQR are set to
# that upper bound and values below Q1 - 1.5*IQR to the lower bound.
# Series.clip applies both bounds in one vectorized step, replacing the
# per-row Python loop and its scalar `.loc` writes of float bounds into
# columns that may hold integers.
for col in bx_col:
    lower_quartile = data2[col].quantile(0.25)
    upper_quartile = data2[col].quantile(0.75)
    iqr = upper_quartile - lower_quartile
    lower_bound = lower_quartile - 1.5 * iqr
    upper_bound = upper_quartile + 1.5 * iqr
    data2[col] = data2[col].clip(lower=lower_bound, upper=upper_bound)
print("Outliers have been replaced with the threshold value.")

In [None]:
data2.head()

#Feature Engineering

In [None]:
# Collapse the education levels into Basic / Graduation / Postgraduation.
education_groups = {
    "Basic": "Basic",
    "2n Cycle": "Postgraduation",
    "Graduation": "Graduation",
    "Master": "Postgraduation",
    "PhD": "Postgraduation",
}
# Reduce marital status to whether the customer lives with a partner.
partner_flags = {
    "Married": "Yes", "Together": "Yes",
    "Absurd": "No", "Widow": "No", "YOLO": "No",
    "Divorced": "No", "Single": "No", "Alone": "No",
}
data2["Education1"] = data2["Education"].replace(education_groups)
data2["Partner"] = data2["Marital_Status"].replace(partner_flags)

# Bucket ages into three labelled intervals: (24, 44], (44, 64], (64, 100].
age_bin_edges = [24, 44, 64, 100]
age_bin_labels = ['Adult', 'Middel Aged', 'Senior Citizen']
data2['Age_Group'] = pd.cut(x=data2['Age'], bins=age_bin_edges, labels=age_bin_labels)

# The original categorical columns are replaced by the derived ones.
data2.drop(columns=["Marital_Status", "Education"], inplace=True)

In [None]:
data2["Age_Group"].value_counts()

In [None]:
data2["Education1"].value_counts()

##Encoding

In [None]:
from sklearn import preprocessing

# Replace each categorical column with integer codes. The same encoder
# object is re-fitted for every column, in this order.
# NOTE(review): the codes are nominal; for Month_Joined they presumably do
# not follow calendar order — confirm this is acceptable for clustering.
label_encoder = preprocessing.LabelEncoder()
for categorical_column in ("Month_Joined", "Partner", "Education1", "Age_Group"):
    data2[categorical_column] = label_encoder.fit_transform(data2[categorical_column])


In [None]:
data2["Age_Group"].value_counts()

In [None]:
data2["Education1"].value_counts()

In [None]:
data2.tail()

#Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# Continuous columns that are standardized.
# NOTE(review): every column outside `num_cols` is copied into `scaled_data`
# unscaled; confirm this is intended for the distance-based clustering below.
num_cols = ['Income', 'Recency', 'MntWines', 'MntFruits',
            'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
            'MntGoldProds', 'Age', 'Total_Expenses']

scaler = StandardScaler()
#scaler = MinMaxScaler()

# Scale a copy so `data2` keeps the original units for the later profiling plots.
scaled_data = data2.copy()
standardized_values = scaler.fit_transform(scaled_data[num_cols])
scaled_data[num_cols] = standardized_values

In [None]:
scaled_data.head()

#DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Grid search over eps and min_samples. A configuration is reported only
# when it yields between 2 and 6 clusters (the noise label -1 is not
# counted as a cluster).
eps_values = [0.3, 0.4, 0.5, 0.6]
for eps in eps_values:
    for min_samples in range(2, 6):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(scaled_data)

        distinct_labels = set(dbscan_labels)
        has_noise = -1 in distinct_labels
        n_clusters = len(distinct_labels) - (1 if has_noise else 0)
        n_noise = list(dbscan_labels).count(-1)

        if not (1 < n_clusters < 7):
            continue

        # The score is computed over all points, noise included.
        silhouette_avg = silhouette_score(scaled_data, dbscan_labels)
        print(f'eps: {eps}, min_samples: {min_samples}, Estimated number of clusters: {n_clusters}, Estimated number of noise points: {n_noise}, Silhouette Score: {silhouette_avg}')


### The silhouette score for DBSCAN is consistently low across different values of epsilon (eps) and min_samples, indicating poor clustering quality. The silhouette score is negative, suggesting that the clusters are overlapping, and the data points are poorly matched to their clusters.


#Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Ward-linkage hierarchical clustering cut at a distance threshold of 4.
# Ward linkage only supports Euclidean distance, which is also the default,
# so the distance argument is omitted: the former `affinity=` keyword was
# removed in scikit-learn 1.4 and raises a TypeError there.
clustering = AgglomerativeClustering(
    n_clusters=None, linkage='ward', distance_threshold=4
).fit(scaled_data)
cluster_labels = clustering.labels_

# Calculate silhouette score
silhouette_avg = silhouette_score(scaled_data, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')

In [None]:
import inspect

# scikit-learn renamed AgglomerativeClustering's `affinity` keyword to `metric`
# (added in 1.2, `affinity` removed in 1.4). Pick whichever the installed
# version accepts so the cell runs on old and new releases.
_distance_kwarg = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering).parameters
    else 'affinity'
)

linkage_methods = ['complete', 'average', 'single']
distance_metrics = ['manhattan']
for linkage in linkage_methods:
    for metric in distance_metrics:
        for threshold in range(2, 5):
            # Perform hierarchical clustering
            clustering = AgglomerativeClustering(
                n_clusters=None,
                linkage=linkage,
                distance_threshold=threshold,
                **{_distance_kwarg: metric},
            ).fit(scaled_data)
            cluster_labels = clustering.labels_

            # The silhouette score is undefined for a single cluster
            if len(set(cluster_labels)) > 1:
                silhouette_avg = silhouette_score(scaled_data, cluster_labels)
                print(f'Linkage: {linkage}, Distance Metric: {metric}, Distance Threshold: {threshold}, Silhouette Score: {silhouette_avg}')


### The silhouette scores for hierarchical clustering vary with the linkage method and distance threshold, but generally tend to be higher than for DBSCAN. The highest silhouette score, around 0.20, is achieved with "ward" linkage, "euclidean" distance, and a distance threshold of 4.



#K Means Clustering

##Elbow Method

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Elbow plot for a seeded K-Means estimator, with k=10 passed to the visualizer.
seeded_kmeans = KMeans(random_state=42)
elbow_graph = KElbowVisualizer(seeded_kmeans, k=10)
elbow_graph.fit(scaled_data)
elbow_graph.show()

### The elbow curve suggests 5 as the optimum number of clusters, with an inertia score (sum of squared distances between each data point and its nearest cluster centroid) of around 93451.

In [None]:
# Inertia (within-cluster sum of squares) for k = 1..10.
# random_state is fixed so the numbers are reproducible and consistent with
# the seeded K-Means used for the elbow plot and the final k=4 model.
nc = range(1, 11)
inertias = []
for n_clusters in nc:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_model.fit(scaled_data)
    inertias.append(kmeans_model.inertia_)
inertias

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the 5-cluster solution suggested by the elbow curve.
# Seeded so the score is reproducible and comparable with the seeded
# 4-cluster model evaluated in the next cell.
kmeans_model = KMeans(n_clusters=5, random_state=42)
five_cluster_labels = kmeans_model.fit_predict(scaled_data)
silhouette_score(scaled_data, five_cluster_labels)

In [None]:
# Silhouette score of the seeded 4-cluster K-Means solution.
kmeans_model = KMeans(n_clusters=4, init='k-means++', random_state=42)
four_cluster_labels = kmeans_model.fit_predict(scaled_data)
silhouette_score(scaled_data, four_cluster_labels)

In [None]:
# Refit the current `kmeans_model` on the scaled features and store the
# resulting cluster label of each customer on the unscaled frame.
data2['Clusters'] = kmeans_model.fit_predict(scaled_data)

In [None]:
# Silhouette score for k = 2..10 (the score is undefined for k = 1).
# random_state is fixed so the curve is reproducible and consistent with the
# seeded k=4 model used for the final cluster assignment.
nc = range(2, 11)
l = []
for i in nc:
    kmeans_model = KMeans(n_clusters=i, random_state=42)
    l.append(silhouette_score(scaled_data, kmeans_model.fit_predict(scaled_data)))

In [None]:
l

In [None]:
# Silhouette score against the number of clusters.
plt.figure(figsize=(10, 4))
# Matplotlib 3.6 renamed the seaborn style sheets to "seaborn-v0_8-*" and
# 3.8 removed the old names, so use whichever name this installation provides.
for style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.scatter(nc, l, s=100, color="black")
plt.plot(nc, l, linestyle="-.")
plt.xticks(nc)
plt.show()

### The silhouette score at k=4, while not very high, suggests that K-Means is able to form somewhat distinct clusters, although there may still be some overlap or outliers. This score is nevertheless higher than the silhouette scores obtained from DBSCAN and hierarchical clustering.
### So we can conclude that k-means clustering with k=4 would likely provide better clustering results for this dataset.

In [None]:
# K-Means (k=4) fitted on three scaled features only, shown as a 3D scatter
# coloured by cluster label.
selected_features = ['Income', 'Total_Expenses', 'TotalNumPurchases']
X_subset = scaled_data[selected_features]

kmeans = KMeans(n_clusters=4)
kmeans.fit(X_subset)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# One axis per selected feature, in the order listed above.
income_axis, expenses_axis, purchases_axis = (X_subset[name] for name in selected_features)
sc = ax.scatter(income_axis, expenses_axis, purchases_axis,
                c=kmeans.labels_, cmap='viridis', s=50, alpha=0.7)

ax.set_title('K-Means Clustering without Dimensionality Reduction (3D)')
ax.set_xlabel('Income')
ax.set_ylabel('Total Expenses')
ax.set_zlabel('Total Purchases')

# Legend with one entry per cluster colour
legend1 = ax.legend(*sc.legend_elements(), title="Clusters ")
ax.add_artist(legend1)

plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Project the scaled data onto three principal components and cluster the
# projection with K-Means (k=4).
pca = PCA(n_components=3)
X_pca = pca.fit_transform(scaled_data)
kmeans = KMeans(n_clusters=4)
kmeans.fit(X_pca)

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')

# Data points, coloured by cluster label
pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
sc = ax.scatter(pc1, pc2, pc3, c=kmeans.labels_, cmap='viridis', s=50, alpha=0.7)

# Cluster centres as red triangles
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2],
           marker='^', c='red', s=200, label='Centers                      ')

# The cluster-colour legend is added as an artist so that the later
# plt.legend() call (for the centres) does not replace it.
legend1 = ax.legend(*sc.legend_elements(), title="Clusters")
ax.add_artist(legend1)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('K-Means Clustering with PCA (3D)')
plt.legend()
plt.show()

In [None]:
data2["Clusters"].value_counts()

In [None]:
# Number of customers assigned to each cluster.
plt.figure(figsize=(12, 8))
cluster_count_options = dict(x='Clusters', palette="deep")
sns.countplot(data=data2, **cluster_count_options)
plt.title('Customer in each clusters')
plt.show()

### Cluster 0 has the highest number of customers, followed by cluster 1.

In [None]:
# One stacked panel per demographic attribute: customers per cluster,
# split by the attribute's values.
cols1 = ['Kidhome', 'Teenhome', 'Partner', 'Education1', 'children', "Age_Group"]
panel_count = len(cols1)
fig, axes = plt.subplots(panel_count, 1, figsize=(10, 5 * panel_count))
for panel, attribute in zip(axes, cols1):
    sns.countplot(x='Clusters', data=data2, ax=panel, hue=attribute, palette='deep')
    panel.set_title(f'{attribute} vs Clusters')
plt.tight_layout()
plt.show()

In [None]:
# One stacked panel per financial / behavioural measure: bar plot per cluster.
cols2 = ['Income', 'Total_Expenses', 'Total_Acc_Cmp', 'TotalNumPurchases']
panel_count = len(cols2)
fig, axes = plt.subplots(panel_count, 1, figsize=(10, 4 * panel_count))
for panel, measure in zip(axes, cols2):
    sns.barplot(x='Clusters', y=measure, data=data2, ax=panel, palette="deep")
    panel.set_title(f'{measure} vs Clusters')

plt.tight_layout()
plt.show()

In [None]:
# Totals per (cluster, age group) for each spending / activity measure,
# drawn as one grouped bar chart per measure.
cols3 = ['MntWines', 'MntFruits', 'MntMeatProducts',
         'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
         'Total_Expenses', 'Total_Acc_Cmp', 'TotalNumPurchases']
grouped_by_cluster_age = data2.groupby(['Clusters', 'Age_Group'])[cols3]
age_group_spending = grouped_by_cluster_age.sum().reset_index()

fig, axes = plt.subplots(len(cols3), 1, figsize=(12, 30))
for panel, measure in zip(axes, cols3):
    sns.barplot(x='Clusters', y=measure, hue='Age_Group',
                data=age_group_spending, ax=panel, palette='deep')
    panel.set_title(f'{measure} by different Age Groups')
    panel.set_xlabel('clusters')
    panel.set_ylabel(f'Total {measure}')

plt.tight_layout()
plt.show()


In [None]:
# Total_Expenses per cluster, split by number of children and by education
# group; one figure per splitting attribute.
cols4 = ["children", 'Education1']
for split_by in cols4:
    plt.figure(figsize=(10, 6))
    sns.barplot(data=data2, x='Clusters', y='Total_Expenses',
                hue=split_by, palette="deep")
    plt.title(f'Total_Expenses by {split_by} and Clusters')
    plt.xlabel('Clusters')
    plt.ylabel('Total_Expenses')
    plt.legend(title=split_by)
    plt.tight_layout()

plt.show()


In [None]:
# Mean spend per product category within each cluster.
cols5 = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
cluster_means = data2.groupby('Clusters')[cols5].mean().reset_index()

# Long format: one row per (cluster, product) pair, as expected by seaborn's `hue`.
melted_df = cluster_means.melt(id_vars='Clusters',
                               var_name='Product',
                               value_name='Mean_Spend')

plt.figure(figsize=(12, 6))
sns.barplot(x='Clusters', y='Mean_Spend', hue='Product',
            data=melted_df, palette='deep')
plt.title('Average Spend on Products')
plt.xlabel('Cluster')
plt.ylabel('Average Spend')
plt.legend(title='Products', loc='upper right')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Number of customers per enrolment year, split by cluster.
plt.figure(figsize=(12, 6))
year_count_options = dict(x='Year_Joined', hue='Clusters', palette='deep')
sns.countplot(data=data2, **year_count_options)
plt.xlabel('Year')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Clusters', loc='upper left')
plt.tight_layout()
plt.show()


### The majority of the customers joined in the year 2013.

## Observations:
Cluster 0:
* Highest number of customers
* Most of the households have at least one kid.
* The majority of the households don't have any teens.
* With or without a partner.
* Most of them have one or two children.
* The majority of them completed graduation or postgraduation; very few have only a basic education.
* Low income
* Low expenditure
* Very few purchases
* Very few accepted campaigns
* Expenses across the different categories are made mostly by middle-aged people.
* Higher expenses are made by households having no children.
* Postgraduates have higher expenses.




Cluster 1:
* Second-highest number of customers
* Most of the households don't have any kids.
* The majority of the households have at least one teen.
* With or without a partner.
* The majority of them have at least one child.
* The majority of them completed graduation or postgraduation.
* Income below 60000
* Low expenditure
* Less spending on wines, fruits, etc.
* Fewer than 20 purchases
* Very few accepted campaigns
* Expenses across the different categories are made mostly by middle-aged people.
* Higher expenses are made by households having no children.
* Postgraduates have higher expenses.

Cluster 2:
* Most of the households don't have any kids.
* The majority of the households don't have any teens.
* With or without a partner.
* Well educated
* Mostly middle-aged people, followed by equal numbers of senior citizens and adults.
* Cluster with the highest income
* High expenditure
* High spending on wines
* High spending on meat products compared to other clusters.
* More than 20 purchases
* Accepted a lot of campaigns


Cluster 3:
* Most of the households don't have any kids.
* The majority of the households have at least one teen.
* With or without a partner.
* Well educated
* Mostly middle-aged people and senior citizens.
* High income
* High expenditure
* Highest spending on wines
* High spending on meat products.
* More than 25 purchases
* Accepted a lot of campaigns

## **Model Building**

In [None]:
data2.head()

In [None]:
print(data2.shape)
print(data2["Clusters"].value_counts())

In [None]:
# Features are every column except the cluster label; the label is the target.
# Selecting by column name instead of by position keeps the split correct
# even if `Clusters` is not the last column of `data2` (for example after a
# cell is re-run or a new column is added).
target_column = 'Clusters'
x = data2.drop(columns=target_column)
y = data2[target_column]

#Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the seed fixes the split.
split_parts = train_test_split(x, y, test_size=0.25, random_state=45)
x_train, x_test, y_train, y_test = split_parts
x_train.shape, x_test.shape

## KNN Model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Fit a 5-nearest-neighbours classifier on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)

# Accuracy on the data the model was fitted on and on the held-out data.
train_accuracy1 = knn_model.score(x_train, y_train)
test_accuracy1 = knn_model.score(x_test, y_test)
print("Training Accuracy:", train_accuracy1)
print("Testing Accuracy:", test_accuracy1)

# Per-class evaluation on the test split.
y_pred = knn_model.predict(x_test)
cm1 = confusion_matrix(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm1)
print("\nClassification Report:\n", knn_report)



#Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a seeded logistic-regression classifier on the training split.
log_model = LogisticRegression(random_state=42)
log_model.fit(x_train, y_train)

# Accuracy on the data the model was fitted on and on the held-out data.
train_accuracy2 = log_model.score(x_train, y_train)
test_accuracy2 = log_model.score(x_test, y_test)
print("Training Accuracy:", train_accuracy2)
print("Testing Accuracy:", test_accuracy2)

# Per-class evaluation on the test split.
y_pred = log_model.predict(x_test)
cm2 = confusion_matrix(y_test, y_pred)
log_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm2)
print("\nClassification Report:\n", log_report)


#Decision  Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a Gini-criterion decision tree on the training split.
model_DT = DecisionTreeClassifier(criterion='gini')
model_DT.fit(x_train, y_train)

# Accuracy on the data the model was fitted on and on the held-out data.
train_accuracy3 = model_DT.score(x_train, y_train)
test_accuracy3 = model_DT.score(x_test, y_test)
print("Training Accuracy:", train_accuracy3)
print("Testing Accuracy:", test_accuracy3)

# Per-class evaluation on the test split.
y_pred = model_DT.predict(x_test)
cm3 = confusion_matrix(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm3)
print("\nClassification Report:\n", tree_report)

#Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(x_train, y_train)

# Accuracy on the data the model was fitted on and on the held-out data.
train_accuracy4 = rf_classifier.score(x_train, y_train)
test_accuracy4 = rf_classifier.score(x_test, y_test)
print("Training Accuracy:", train_accuracy4)
print("Testing Accuracy:", test_accuracy4)

# Per-class evaluation on the test split.
y_pred = rf_classifier.predict(x_test)
cm4 = confusion_matrix(y_test, y_pred)
forest_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm4)
print("\nClassification Report:\n", forest_report)

#XGBoost Model

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Fit a regularized, shallow gradient-boosted classifier on the training split.
xgb_classifier = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.2,
                                   reg_alpha=1, reg_lambda=1)
xgb_classifier.fit(x_train, y_train)

# Training accuracy
y_train_pred = xgb_classifier.predict(x_train)
train_accuracy5 = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy5)

# 5-fold cross-validation on the training split
cv_scores = cross_val_score(xgb_classifier, x_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

# Test-set predictions are computed once here; the earlier duplicate
# prediction of the same data was redundant.
y_pred = xgb_classifier.predict(x_test)
test_accuracy5 = accuracy_score(y_test, y_pred)
print("Testing Accuracy:", test_accuracy5)

# Per-class evaluation on the test split.
cm5 = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm5)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

#CatBoost Model

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Create CatBoost classifier
catboost_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=2,
                                    loss_function='MultiClass', l2_leaf_reg=3)

# Early stopping picks the number of iterations from the eval set, so the
# eval set must not be the test split: using it would leak test information
# into the model and inflate the reported test accuracy. A validation subset
# is carved out of the training split instead.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=45
)

# Train the model
catboost_model.fit(x_fit, y_fit, eval_set=(x_val, y_val), early_stopping_rounds=10)

In [None]:
# Training accuracy from predictions on the training split
y_train_pred = catboost_model.predict(x_train)
train_accuracy6 = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy6)

# Testing accuracy via the model's own score method
test_accuracy6 = catboost_model.score(x_test, y_test)
print("Testing Accuracy:", test_accuracy6)

# Per-class evaluation on the test split.
y_pred = catboost_model.predict(x_test)
cm6 = confusion_matrix(y_test, y_pred)
catboost_report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm6)
print("\nClassification Report:\n", catboost_report)


#Model Conclusion

In [None]:
# Grouped bar chart comparing training and testing accuracy of every model.
model_names = ["KNN", "Logistic Regression", "Decision Tree", "Random Forest", "XGBoost", "CatBoost"]
training_scores = [train_accuracy1, train_accuracy2, train_accuracy3,
                   train_accuracy4, train_accuracy5, train_accuracy6]
testing_scores = [test_accuracy1, test_accuracy2, test_accuracy3,
                  test_accuracy4, test_accuracy5, test_accuracy6]
data3 = {
    'Model': model_names,
    'Training Accuracy': training_scores,
    'Testing Accuracy': testing_scores,
}

plt.figure(figsize=(10, 8))

# One slot per model on the x-axis; the two bars are shifted left and right of it.
x_pos = np.arange(len(data3['Model']))
bar_width = 0.35
bars1 = plt.bar(x_pos - 0.2, data3['Training Accuracy'], width=bar_width, label='Training Accuracy')
bars2 = plt.bar(x_pos + 0.2, data3['Testing Accuracy'], width=bar_width, label='Testing Accuracy')

# Write the accuracy, rounded to two decimals, above each bar
for bar in (*bars1, *bars2):
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, yval, round(yval, 2), ha='center', va='bottom')

plt.title('Training and Testing Accuracy of Different Models')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.xticks(x_pos, data3['Model'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()

### CatBoost shows the best balance between training and testing accuracy, with a training accuracy of 0.98 and a testing accuracy of 0.95. This suggests that CatBoost generalizes well and is likely the most robust model among those compared.

### XGBoost also shows a good balance. Although it overfits somewhat, its testing accuracy of 0.96 is very high, indicating that the model still performs well on unseen data and retains strong predictive power.

#Deployment

In [None]:
import pickle

# Serialize the trained XGBoost classifier to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() call left the handle open.
filename = 'XGBoostModel.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_classifier, model_file)