# **Exercise 1: Predicting Diabetes Onset Using Logistic Regression**

In [5]:
from google.colab import files
uploaded = files.upload()

Saving diabetes.csv to diabetes (1).csv


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

data = pd.read_csv(list(uploaded.keys())[0])

In [8]:
# Handle missing values (replace 0 with NaN and then fill with mean)
data.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)
data.fillna(data.mean(), inplace=True)

# Standardize features
X = data.drop('Outcome', axis=1)
y = data['Outcome']
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.7532467532467533
Precision: 0.6666666666666666
Recall: 0.6181818181818182


# **Exercise 2: Classifying Iris Species Using Decision Trees**

In [13]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Implement Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model Ealuation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 1.0
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


# **Exercise 3: Predicting Titanic Survival Using Logistic Regression**

In [52]:
from google.colab import files
uploaded = files.upload()

Saving titanic.csv to titanic (1).csv


In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

data = pd.read_csv(list(uploaded.keys())[0])

In [54]:
# Handle Missing Values
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Encode categorical variables and standardize numerical features
num_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

preprocessor = ColumnTransformer (transformers=[ ('num', StandardScaler(), num_features), ('cat', OneHotEncoder(), categorical_features) ] )

# Define the target and features
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = data['Survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
pipeline = Pipeline( steps= [ ('preprocessor', preprocessor), ('classifier', LogisticRegression()) ] )

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluate model
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"ROC-AUC: {roc_auc}")

ROC-AUC: 0.8754182754182754


# **Exercise 4: Classifying Emails as Spam Using Decision Trees**

In [58]:
from google.colab import files
uploaded = files.upload()

Saving email_spam.csv to email_spam (5).csv


In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv(list(uploaded.keys())[0], encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [68]:
print(df.head())
print(df.columns)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [69]:
# Dropping unnecessary columns
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

# Renaming columns for convenience
df.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

# Encoding the labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check for missing values
print(df.isnull().sum())

# Split the data
X = df['message']
y = df['label']

label      0
message    0
dtype: int64


In [70]:
# Converting text data into numerical features using TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

# Transforming text data to feature vectors
X = tfidf.fit_transform(X).toarray()

# Splitting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.8943661971830986
Recall: 0.8466666666666667
F1 Score: 0.8698630136986302


# **Exercise 5: Predicting Customer Satisfaction Using Logistic Regression**

In [71]:
from google.colab import files
uploaded = files.upload()

Saving restaurant_customer_satisfaction.csv to restaurant_customer_satisfaction.csv


In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

df = pd.read_csv(list(uploaded.keys())[0], encoding='utf-8')
df.head()


Unnamed: 0,CustomerID,Age,Gender,Income,VisitFrequency,AverageSpend,PreferredCuisine,TimeOfVisit,GroupSize,DiningOccasion,MealType,OnlineReservation,DeliveryOrder,LoyaltyProgramMember,WaitTime,ServiceRating,FoodRating,AmbianceRating,HighSatisfaction
0,654,35,Male,83380,Weekly,27.829142,Chinese,Breakfast,3,Business,Takeaway,0,1,1,43.523929,2,5,4,0
1,655,19,Male,43623,Rarely,115.408622,American,Dinner,1,Casual,Dine-in,0,0,0,57.524294,5,5,3,0
2,656,41,Female,83737,Weekly,106.693771,American,Dinner,6,Celebration,Dine-in,0,1,0,48.682623,3,4,5,0
3,657,43,Male,96768,Rarely,43.508508,Indian,Lunch,1,Celebration,Dine-in,0,0,0,7.552993,4,5,1,0
4,658,55,Female,67937,Monthly,148.084627,Chinese,Breakfast,1,Business,Takeaway,0,0,1,37.789041,2,3,5,0


In [80]:
df.drop(['CustomerID'], axis=1, inplace=True)
print(df.dtypes)
# Defining feature and target columns
feature_columns = ['Age', 'Gender', 'Income', 'VisitFrequency', 'AverageSpend', 'PreferredCuisine', 'TimeOfVisit', 'GroupSize', 'DiningOccasion', 'MealType', 'OnlineReservation',
                    'DeliveryOrder', 'LoyaltyProgramMember', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']
target_column = 'HighSatisfaction'

# Split data into features and target
X = df[feature_columns]
y = df[target_column]

Age                       int64
Gender                   object
Income                    int64
VisitFrequency           object
AverageSpend            float64
PreferredCuisine         object
TimeOfVisit              object
GroupSize                 int64
DiningOccasion           object
MealType                 object
OnlineReservation         int64
DeliveryOrder             int64
LoyaltyProgramMember      int64
WaitTime                float64
ServiceRating             int64
FoodRating                int64
AmbianceRating            int64
HighSatisfaction          int64
dtype: object


In [81]:
print(X.dtypes)

Age                       int64
Gender                   object
Income                    int64
VisitFrequency           object
AverageSpend            float64
PreferredCuisine         object
TimeOfVisit              object
GroupSize                 int64
DiningOccasion           object
MealType                 object
OnlineReservation         int64
DeliveryOrder             int64
LoyaltyProgramMember      int64
WaitTime                float64
ServiceRating             int64
FoodRating                int64
AmbianceRating            int64
dtype: object


In [84]:
# Defining Categorical and numerical features
num_features = ['Age', 'Income', 'AverageSpend', 'GroupSize', 'OnlineReservation', 'DeliveryOrder', 'LoyaltyProgramMember', 'WaitTime', 'ServiceRating', 'FoodRating', 'AmbianceRating']
cat_features = ['Gender', 'VisitFrequency', 'PreferredCuisine', 'TimeOfVisit', 'DiningOccasion', 'MealType']

# Define transformers for preprocessing
preprocessor = ColumnTransformer( transformers= [ ('num', StandardScaler(), num_features), ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features) ] )

# Creating pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[ ('preprocessor', preprocessor), ('classifier', LogisticRegression(max_iter=1000)) ] )

# Spliting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

#Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.91
Confusion Matrix:
[[252   7]
 [ 20  21]]
