# **Exercise 1: Predicting Employee Attrition Using Logistic Regression**

In [48]:
from google.colab import files
uploaded = files.upload()

Saving HR_Employee_Attrition.csv to HR_Employee_Attrition (1).csv


In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

df = pd.read_csv(list(uploaded.keys())[0])
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [80]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [82]:
# Checking and handling missing values in the target variable
print(df['Attrition'].isna().sum())
df = df.dropna(subset=['Attrition'])


0


In [83]:
# Encoding categorical variables and standardize numerical features
categorical_features = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
numerical_features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction','HourlyRate', 'JobInvolvement', 'JobLevel',
                      'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
                      'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
                      'YearsWithCurrManager']

In [84]:
# Define preprocessing steps
preprocessor = ColumnTransformer (transformers=[ ('num', StandardScaler(), numerical_features), ('cat', OneHotEncoder(), categorical_features) ] )

In [85]:
# Logistic regression model
model = Pipeline(steps = [('preprocessor', preprocessor), ('classifier', LogisticRegression()) ] )

In [86]:
# Split ting data into features and target variable
X = df.drop(columns='Attrition')
y = df['Attrition'].map({'Yes': 1, 'No': 0})

In [87]:
# Check for any NaN values in y and drop them
print(y.isna().sum())
y = y.dropna()

0


In [88]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [89]:
# Train the model
model.fit(X_train, y_train)

In [90]:
# Predictions
y_pred = model.predict(X_test)

In [92]:
# Model Evaluation
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('Confusion Matrix:')
print(conf_matrix)

Precision: 0.6428571428571429
Recall: 0.46153846153846156
F1 Score: 0.537313432835821
Confusion Matrix:
[[245  10]
 [ 21  18]]


# **Exercise 2: Classifying Credit Card Fraud Using Decision Trees**

In [140]:
from google.colab import files
uploaded = files.upload()

Saving Credit_Card_Fraud_Detection.csv to Credit_Card_Fraud_Detection.csv


In [141]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix

df = pd.read_csv(list(uploaded.keys())[0])
print(df.head())

   Unnamed: 0  Customer_ID  A_1    A_2    A_3  A_4  A_5  A_6    A_7  A_8  A_9  \
0           0     15776156    1  22.08  11.46    2    4    4  1.585    0    0   
1           1     15739548    0  22.67   7.00    2    8    4  0.165    0    0   
2           2     15662854    0  29.58   1.75    1    4    4  1.250    0    0   
3           3     15687688    0  21.67  11.50    1    5    3  0.000    1    1   
4           4     15715750    1  20.17   8.17    2    6    4  1.960    1    1   

   A_10  A_11  A_12  A_13  A_14  class  
0     0     1     2   100  1213      0  
1     0     0     2   160     1      0  
2     0     1     2   280     1      0  
3    11     1     2     0     1      1  
4    14     0     2    60   159      1  


In [142]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   690 non-null    int64  
 1   Customer_ID  690 non-null    int64  
 2   A_1          690 non-null    int64  
 3   A_2          690 non-null    float64
 4   A_3          690 non-null    float64
 5   A_4          690 non-null    int64  
 6   A_5          690 non-null    int64  
 7   A_6          690 non-null    int64  
 8   A_7          690 non-null    float64
 9   A_8          690 non-null    int64  
 10  A_9          690 non-null    int64  
 11  A_10         690 non-null    int64  
 12  A_11         690 non-null    int64  
 13  A_12         690 non-null    int64  
 14  A_13         690 non-null    int64  
 15  A_14         690 non-null    int64  
 16  class        690 non-null    int64  
dtypes: float64(3), int64(14)
memory usage: 91.8 KB


Unnamed: 0.1,Unnamed: 0,Customer_ID,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,A_10,A_11,A_12,A_13,A_14,class
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,344.5,15690470.0,0.678261,31.568203,4.758725,1.766667,7.372464,4.692754,2.223406,0.523188,0.427536,2.4,0.457971,1.928986,184.014493,1018.385507,0.444928
std,199.330128,71506.47,0.467482,11.853273,4.978163,0.430063,3.683265,1.992316,3.346513,0.499824,0.49508,4.86294,0.498592,0.298813,172.159274,5210.102598,0.497318
min,0.0,15565710.0,0.0,13.75,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
25%,172.25,15631690.0,0.0,22.67,1.0,2.0,4.0,4.0,0.165,0.0,0.0,0.0,0.0,2.0,80.0,1.0,0.0
50%,344.5,15690160.0,1.0,28.625,2.75,2.0,8.0,4.0,1.0,1.0,0.0,0.0,0.0,2.0,160.0,6.0,0.0
75%,516.75,15751900.0,1.0,37.7075,7.2075,2.0,10.0,5.0,2.625,1.0,1.0,3.0,1.0,2.0,272.0,396.5,1.0
max,689.0,15815440.0,1.0,80.25,28.0,3.0,14.0,9.0,28.5,1.0,1.0,67.0,1.0,3.0,2000.0,100001.0,1.0


In [143]:
# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'Customer_ID'])

# Define features and target
X = df.drop('class', axis=1)
y = df['class']

In [144]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Models
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [147]:
# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"ROC-AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

ROC-AUC: 0.850236646382691
Confusion Matrix:
[[78  9]
 [10 41]]


# **Exercise 3: Predicting Heart Disease Using Logistic Regression**

In [95]:
from google.colab import files
uploaded = files.upload()

Saving heart.csv to heart.csv


In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

df = pd.read_csv(list(uploaded.keys())[0])
print(df.head())
df.info()
df.describe()

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [103]:
# Define features and target
X = df.drop('target', axis=1)
y = df['target']

In [104]:
# Numerical and categorical features
numeric_feature = X.select_dtypes(include=['int64', 'float64']).columns
categorical_feature = []

In [106]:
preprocessor = ColumnTransformer (transformers=[ ('num', StandardScaler(), numeric_feature)] )

In [107]:
# Logistic regression model
pipeline = Pipeline(steps = [('preprocessor', preprocessor), ('classifier', LogisticRegression(max_iter=1000)) ] )

In [108]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC-AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.7951219512195122
ROC-AUC: 0.7947363411383972
Confusion Matrix:
[[73 29]
 [13 90]]


# **Exercise 4: Classifying Emails as Spam Using Decision Trees**

In [111]:
from google.colab import files
uploaded = files.upload()

Saving email_spam.csv to email_spam.csv


In [121]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv(list(uploaded.keys())[0], encoding='latin1')
print(df.head())
df.info()
df.describe()

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
m

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [122]:
# Dropping unnecessary columns
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

# Renaming columns for convenience
df.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

# Encoding the labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check for missing values
print(df.isnull().sum())

# Split the data
X = df['message']
y = df['label']

label      0
message    0
dtype: int64


In [123]:
# Text vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X = tfidf.fit_transform(X).toarray()

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Precision: 0.9014084507042254
Recall: 0.8533333333333334
F1 Score: 0.8767123287671234
Confusion Matrix:
[[951  14]
 [ 22 128]]


# **Exercise 5: Predicting Customer Satisfaction Using Logistic Regression**

In [126]:
from google.colab import files
uploaded = files.upload()

Saving restaurant_customer_satisfaction.csv to restaurant_customer_satisfaction.csv


In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

df = pd.read_csv(list(uploaded.keys())[0], encoding='utf-8')
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,VisitFrequency,AverageSpend,PreferredCuisine,TimeOfVisit,GroupSize,DiningOccasion,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
0,654,35,Male,83380,Weekly,27.829142,Chinese,Breakfast,3,Business,Takeaway,0,1,1,43.523929,2,5,4,0
1,655,19,Male,43623,Rarely,115.408622,American,Dinner,1,Casual,Dine-in,0,0,0,57.524294,5,5,3,0
2,656,41,Female,83737,Weekly,106.693771,American,Dinner,6,Celebration,Dine-in,0,1,0,48.682623,3,4,5,0
3,657,43,Male,96768,Rarely,43.508508,Indian,Lunch,1,Celebration,Dine-in,0,0,0,7.552993,4,5,1,0
4,658,55,Female,67937,Monthly,148.084627,Chinese,Breakfast,1,Business,Takeaway,0,0,1,37.789041,2,3,5,0


In [131]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CustomerID            1500 non-null   int64  
 1   Age                   1500 non-null   int64  
 2   Gender                1500 non-null   object 
 3   Income                1500 non-null   int64  
 4   VisitFrequency        1500 non-null   object 
 5   AverageSpend          1500 non-null   float64
 6   PreferredCuisine      1500 non-null   object 
 7   TimeOfVisit           1500 non-null   object 
 8   GroupSize             1500 non-null   int64  
 9   DiningOccasion        1500 non-null   object 
 10  MealType              1500 non-null   object 
 11  OnlineReservation     1500 non-null   int64  
 12  DeliveryOrder         1500 non-null   int64  
 13  LoyaltyProgramMember  1500 non-null   int64  
 14  WaitTime              1500 non-null   float64
 15  ServiceRating        

Unnamed: 0,CustomerID,Age,Income,AverageSpend,GroupSize,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,1403.5,43.832,85921.89,105.659004,5.035333,0.296667,0.405333,0.48,30.16355,3.044,2.997333,2.987333,0.134
std,433.157015,14.967157,38183.051749,52.381849,2.558864,0.456941,0.49112,0.499766,17.214184,1.423405,1.41892,1.450716,0.340766
min,654.0,18.0,20012.0,10.306127,1.0,0.0,0.0,0.0,0.00138,1.0,1.0,1.0,0.0
25%,1028.75,31.75,52444.0,62.287907,3.0,0.0,0.0,0.0,15.235423,2.0,2.0,2.0,0.0
50%,1403.5,44.0,85811.0,104.626408,5.0,0.0,0.0,0.0,30.044055,3.0,3.0,3.0,0.0
75%,1778.25,57.0,119159.25,148.64933,7.0,1.0,1.0,1.0,45.285649,4.0,4.0,4.0,0.0
max,2153.0,69.0,149875.0,199.973527,9.0,1.0,1.0,1.0,59.970762,5.0,5.0,5.0,1.0


In [133]:
df.drop(['CustomerID'], axis=1, inplace=True)
print(df.dtypes)
# Defining feature and target columns
feature_columns = ['Age', 'Gender', 'Income', 'VisitFrequency', 'AverageSpend', 'PreferredCuisine', 'TimeOfVisit', 'GroupSize', 'DiningOccasion', 'MealType', 'OnlineReservation',
                    'DeliveryOrder', 'LoyaltyProgramMember', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']
target_column = 'HighSatisfaction'

# Split data into features and target
X = df[feature_columns]
y = df[target_column]

Age                       int64
Gender                   object
Income                    int64
VisitFrequency           object
AverageSpend            float64
PreferredCuisine         object
TimeOfVisit              object
GroupSize                 int64
DiningOccasion           object
MealType                 object
OnlineReservation         int64
DeliveryOrder             int64
LoyaltyProgramMember      int64
WaitTime                float64
ServiceRating             int64
FoodRating                int64
AmbianceRating            int64
HighSatisfaction          int64
dtype: object


In [134]:
# Defining Categorical and numerical features
num_feature = ['Age', 'Income', 'AverageSpend', 'GroupSize', 'OnlineReservation', 'DeliveryOrder', 'LoyaltyProgramMember', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']
categorical_feature = ['Gender', 'VisitFrequency', 'PreferredCuisine', 'TimeOfVisit', 'DiningOccasion', 'MealType']

In [135]:
# Define transformers for preprocessing
preprocessor = ColumnTransformer( transformers= [ ('num', StandardScaler(), num_feature), ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_feature) ] )

In [138]:
pipeline = Pipeline(steps=[ ('preprocessor', preprocessor), ('classifier', LogisticRegression(max_iter=1000)) ] )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

In [139]:
#Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.91
Confusion Matrix:
[[252   7]
 [ 20  21]]
