## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline


from sklearn import metrics

# import warnings;
# warnings.filterwarnings('ignore');
import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used below can be read off the output.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


# Importing Dataset

In [None]:
# Read the Social Network Ads CSV into a DataFrame and preview the first rows.
csv_path = '/kaggle/input/social-network-ads/Social_Network_Ads.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataset.shape

In [None]:
# Per-column overview (names, non-null counts, dtypes) printed by pandas.
dataset.info()

In [None]:
# Number of distinct values in each column.
dataset.nunique()

Note :
1. This dataset has 500 samples and 4 features.
2. 'Gender' and 'Purchased' are categorical variables.
3. 'User ID', 'Age' and 'EstimatedSalary' are numerical variables.
4. 'User ID' is only an identifier and carries no predictive information, so it will be dropped.
5. 'Purchased' is the target variable and it has two classes.



In [None]:
# Drop the 'User ID' column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
dataset = dataset.drop(columns='User ID', errors='ignore')

In [None]:
# Share of missing entries per column, expressed in percent of all rows.
row_count = len(dataset)
dataset.isnull().sum().div(row_count).mul(100)

In [None]:
# Heatmap of the boolean "is missing" mask: one cell per (row, column) entry.
missing_mask = dataset.isnull()
sns.heatmap(missing_mask, cmap='viridis');

In [None]:
# Absolute number of rows in each class of the target column.
dataset['Purchased'].value_counts()

In [None]:
# Relative class frequencies of the target, used to judge whether the data is balanced.
dataset['Purchased'].value_counts(normalize=True)

In [None]:
# Bar chart of how many rows fall into each 'Purchased' class.
sns.countplot(data=dataset, x='Purchased')

Note :
1. The observations above show that this dataset has no null values and that it is a balanced dataset.
2. In the 'Purchased' feature, class 0 --> 'Not purchased' and class 1 --> 'Purchased'.



In [None]:
# Descriptive summary statistics of the data as reported by pandas' describe().
dataset.describe()

#  Checking Outlier

In [None]:
# Distribution of 'Age' with a kernel-density overlay, to eyeball outliers.
sns.histplot(data=dataset, x='Age', kde=True)

In [None]:
# Box plot of 'EstimatedSalary'; points beyond the whiskers would indicate outliers.
sns.boxplot(data=dataset, y='EstimatedSalary')

Note :
From the plots above, we can observe that there are no outliers.

# EDA

In [None]:
# Bar chart of the number of rows per gender.
sns.countplot(data=dataset, x='Gender')

In [None]:
# Absolute number of rows per gender (bracket access, matching the other cells).
dataset['Gender'].value_counts()


In [None]:
# Share of every Gender/Purchased combination, in percent of all rows.
gender_purchase_counts = dataset.groupby('Gender')['Purchased'].value_counts()
gender_purchase_counts / len(dataset) * 100

In [None]:
# Donut chart: share (in % of all rows) of each Gender/Purchased combination.
fig, ax = plt.subplots(figsize=(8, 6))

data_per = dataset.groupby('Gender')['Purchased'].value_counts() / len(dataset) * 100

# Build the wedge labels from the index instead of hard-coding them:
# value_counts() orders each group by frequency, so a fixed label list can end
# up attached to the wrong wedge whenever purchasers outnumber non-purchasers.
status_names = {0: 'Not Purchased', 1: 'Purchased'}
labels = [
    f"{gender} {status_names.get(purchased, purchased)}"
    for gender, purchased in data_per.index
]
sizes = [round(value, 2) for value in data_per]

# Pull every wedge slightly away from the centre; sized to match the data.
explode = (0.05,) * len(sizes)

ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)

# A white circle over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)


Note : 
1. The observation above confirms that the majority of purchasers are female.

In [None]:
# Mean of the 0/1 'Purchased' flag per gender, i.e. the purchase rate of each group.
(
    dataset
    .groupby('Gender')['Purchased']
    .mean()
)

Note :
The information above shows that the purchase rate of female customers is higher than that of male customers.

In [None]:
# Salary distribution per gender, split by purchase outcome, on an explicit 8x6 axes.
salary_fig, salary_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=dataset, x='Gender', y='EstimatedSalary', hue='Purchased', ax=salary_ax)

In [None]:
# Average estimated salary for every Gender/Purchased combination.
salary_groups = dataset.groupby(['Gender', 'Purchased'])
salary_groups['EstimatedSalary'].mean()

Note :
The observation above shows that, for both female and male customers, those with a higher estimated salary make more purchases. 


In [None]:
# Age vs. estimated salary with marginal distributions, coloured by purchase outcome.
sns.jointplot(
    data=dataset,
    x='Age',
    y='EstimatedSalary',
    hue='Purchased',
)

Note :
From the plot above, we can see that most of the products are purchased by people aged between 40 and 60 
with an estimated salary in the range 90,000 to 150,000.

In [None]:
# One point per customer: age by gender, coloured by purchase outcome.
sns.swarmplot(data=dataset, x='Gender', y='Age', hue='Purchased')

In [None]:
# Average age for every Gender/Purchased combination.
age_groups = dataset.groupby(['Gender', 'Purchased'])
age_groups['Age'].mean()

Note:
The plot above confirms that very few purchases are made by customers aged 20 to 30, and
that most purchases are made by customers aged between 40 and 60.

# Independent and Dependent feature Separating

In [None]:
# Separate the predictors from the target by column NAME rather than position.
# Positional iloc slicing silently picks the wrong columns (e.g. 'User ID' as a
# feature and 'EstimatedSalary' as the target) if the 'User ID' drop cell was
# skipped or the column order of the CSV changes.
feature_columns = ['Gender', 'Age', 'EstimatedSalary']
X = dataset[feature_columns]
Y = dataset['Purchased']

#  Correlation between independent features

In [None]:
# Correlation heatmap of the numeric predictors only.
# X still contains the string column 'Gender' at this point; pandas >= 2.0 no
# longer drops non-numeric columns silently in corr() and raises instead, so
# restrict the frame to numeric dtypes explicitly.
numeric_features = X.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True, cmap='RdYlGn')

Note :
The plot above shows that there is no correlation between the independent features.

# Processing categorical feature

In [None]:
# One-hot encode 'Gender'; drop_first=True keeps a single indicator column
# instead of one column per category.
X = pd.get_dummies(X, columns=['Gender'], drop_first=True)

In [None]:
# Preview the feature frame after encoding 'Gender'.
X.head()

# Split the Dataset for Training & Testing


In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=101,
)

#  Feature Scaling

In [None]:
# Standardise the predictors. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into training.
# NOTE: X_train / X_test are overwritten with the scaled arrays returned by the scaler.
standard_Scaler = StandardScaler()
standard_Scaler.fit(X_train)
X_train = standard_Scaler.transform(X_train)
X_test = standard_Scaler.transform(X_test)

# Model Building

# Logistic Regression Model

In [None]:
# Create the logistic regression classifier (fixed seed) and train it on the
# scaled training split.
log_reg = LogisticRegression(random_state=2)
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the logistic regression model on both splits.
train_pred, test_pred = (
    log_reg.predict(X_train),
    log_reg.predict(X_test),
)


In [None]:
# Training-set metrics for the logistic regression model.
print("Logistic Regression Model Performance (Train Dataset):")

# `score` collects one test accuracy per model; the test-metric cells further
# down append to it, so it is (re)initialised here before the first of them runs.
score = []

accuracy, precision, recall = (
    metrics.accuracy_score(y_train, train_pred),
    metrics.precision_score(y_train, train_pred),
    metrics.recall_score(y_train, train_pred),
)

print("------------------------------------")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

In [None]:
# Test-set metrics for the logistic regression model.
print("Logistic Regression Model Performance (Test Dataset):")

accuracy, precision, recall = [
    scorer(y_test, test_pred)
    for scorer in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
]
# Record the test accuracy for the final model comparison.
score.append(accuracy)

print("------------------------------------\n")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

# Decision Tree Classifier

In [None]:
# Decision tree limited to depth 5, with a fixed seed, trained on the scaled training split.
dt_model = DecisionTreeClassifier(max_depth=5, random_state=2)
dt_model.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the decision tree on both splits.
test_pred, train_pred = (
    dt_model.predict(X_test),
    dt_model.predict(X_train),
)


In [None]:
# Training-set metrics for the decision tree.
print("Decision Tree Model Performance (Train Dataset):")

accuracy, precision, recall = (
    metrics.accuracy_score(y_train, train_pred),
    metrics.precision_score(y_train, train_pred),
    metrics.recall_score(y_train, train_pred),
)
print("------------------------------------")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

In [None]:
# Test-set metrics for the decision tree.
print("Decision Tree Model Performance (Test Dataset):")

accuracy, precision, recall = [
    scorer(y_test, test_pred)
    for scorer in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
]
# Record the test accuracy for the final model comparison.
score.append(accuracy)
print("------------------------------------\n")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

## Random Forest Classifier


In [None]:
# Random forest of 50 depth-5 trees (fixed seed), trained on the scaled
# training split; predictions are then taken on both splits.
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=1,
)
rf_model.fit(X_train, y_train)

test_pred, train_pred = rf_model.predict(X_test), rf_model.predict(X_train)


In [None]:
# Training-set metrics for the random forest.
print("Random Forest Model Performance (Train Dataset):")

accuracy, precision, recall = (
    metrics.accuracy_score(y_train, train_pred),
    metrics.precision_score(y_train, train_pred),
    metrics.recall_score(y_train, train_pred),
)
print("------------------------------------")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

In [None]:
# Test-set metrics for the random forest.
print("Random Forest Model Performance (Test Dataset):")

accuracy, precision, recall = [
    scorer(y_test, test_pred)
    for scorer in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
]
# Record the test accuracy for the final model comparison.
score.append(accuracy)
print("------------------------------------\n")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

# K-Nearest Neighbours Classifier

In [None]:
# Misclassification rate on the test split for every K from 1 through 39
# (the upper bound of range() is exclusive).
# NOTE(review): K is being tuned against the test split here, so the test
# accuracy reported for KNN afterwards is optimistic; consider cross-validation
# on the training split instead.
error = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    mismatches = knn.predict(X_test) != y_test
    error.append(np.mean(mismatches))

In [None]:
# Plot the test error recorded for each candidate K.
# The x values are derived from the length of `error` (K starts at 1) rather
# than repeating the loop's hard-coded range, so the plot cannot fail with a
# length mismatch if the scanned range is changed in the loop cell.
k_values = range(1, len(error) + 1)

plt.figure(figsize=(12, 6))
plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
plt.show()

In [None]:
# Final KNN model with K=15, trained on the scaled training split.
classifier = KNeighborsClassifier(n_neighbors=15)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Class predictions of the K=15 KNN model on both splits.
train_pred, test_pred = (
    classifier.predict(X_train),
    classifier.predict(X_test),
)

In [None]:
# Training-set metrics for the KNN model.
print("K-Nearest Neighbours Model Performance (Train Dataset):")

accuracy, precision, recall = (
    metrics.accuracy_score(y_train, train_pred),
    metrics.precision_score(y_train, train_pred),
    metrics.recall_score(y_train, train_pred),
)
print("------------------------------------")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

In [None]:
# Test-set metrics for the KNN model.
print("K-Nearest Neighbours Model Performance (Test Dataset):")

accuracy, precision, recall = [
    scorer(y_test, test_pred)
    for scorer in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
]
# Record the test accuracy for the final model comparison.
score.append(accuracy)
print("------------------------------------\n")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

# SVC Model

In [None]:
# Support vector classifier with an RBF kernel, trained on the scaled training
# split; predictions are then taken on both splits.
# (A commented-out confusion-matrix snippet that referenced the undefined name
# `y_pred` was removed as dead code.)
svc_model = SVC(kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train)
test_pred = svc_model.predict(X_test)
train_pred = svc_model.predict(X_train)



In [None]:
# Training-set metrics for the SVC model.
print("SVC Model Performance (Train Dataset):")

accuracy, precision, recall = (
    metrics.accuracy_score(y_train, train_pred),
    metrics.precision_score(y_train, train_pred),
    metrics.recall_score(y_train, train_pred),
)
print("------------------------------------")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

In [None]:
# Test-set metrics for the SVC model.
print("SVC Model Performance (Test Dataset):")

accuracy, precision, recall = [
    scorer(y_test, test_pred)
    for scorer in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
]
# Record the test accuracy for the final model comparison.
score.append(accuracy)
print("------------------------------------\n")
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}')

# Compare Models Performance 

In [None]:
# Comparison table: one row per model with its accuracy on the test split.
# The accuracies are recomputed from the fitted estimators instead of being read
# from the append-only `score` list, whose length and order depend on which
# cells were executed (re-running any test-metric cell adds a duplicate entry
# and makes the DataFrame construction fail).
fitted_models = {
    'Logistic': log_reg,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
    'KNN': classifier,
    'SVC': svc_model,
}

models = pd.DataFrame({
    'Model': list(fitted_models),
    'Score': [
        metrics.accuracy_score(y_test, estimator.predict(X_test))
        for estimator in fitted_models.values()
    ],
})

models.head()

In [None]:
# Bar chart of the test accuracy of each model on an 8x6 figure.
plt.figure(figsize=(8, 6))
sns.barplot(data=models, x='Model', y='Score')


# Conclusion :
The plot above shows that the Decision Tree, Random Forest, KNN and SVC models each reach about 90% accuracy on the test set.