In [9]:
#1 Predicting Employee Attrition Using Logistic Regression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [2]:
data = pd.read_csv('HR-Employee-Attrition.csv')

In [4]:
# Checking for missing values
print(data.isnull().sum())


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [11]:
print(data.columns)
print(data.dtypes)


Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeC

In [12]:
#Taking Attrition as the target column
target = 'Attrition'

# Identifying categorical and numerical columns
categorical_cols = [col for col in data.columns if data[col].dtype == 'object' and col != target]
numerical_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]

# Standardization
numerical_transformer = StandardScaler()

# One-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [13]:
# Defining the logistic regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000))])


In [14]:
# Features and Target
X = data.drop(columns=target)
y = data[target]

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Training the model
model.fit(X_train, y_train)


In [18]:
# Making predictions on the test set
y_pred = model.predict(X_test)


In [19]:
# Evaluating the model
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Printing evaluation metrics
print('Precision: {:.2f}'.format(precision))
print('Recall: {:.2f}'.format(recall))
print('F1 Score: {:.2f}'.format(f1))

# Detailed classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)


Precision: 0.64
Recall: 0.46
F1 Score: 0.54
              precision    recall  f1-score   support

          No       0.92      0.96      0.94       255
         Yes       0.64      0.46      0.54        39

    accuracy                           0.89       294
   macro avg       0.78      0.71      0.74       294
weighted avg       0.88      0.89      0.89       294

Confusion Matrix:
 [[245  10]
 [ 21  18]]


In [20]:
#2 Classifying Credit Card Fraud Using Decision Trees
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report


In [21]:
# Loading the dataset
data = pd.read_csv('creditcard.csv')


In [22]:
# Checking for missing values
print(data.isnull().sum())




Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [23]:
# Standardizing the features
scaler = StandardScaler()

# By taking Class as the target variable
X = data.drop(columns='Class')
y = data['Class']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on training data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
# Defining the decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Training the model
model.fit(X_train, y_train)


In [25]:
# Making predictions on the test set
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]


In [26]:
# Evaluating the model using ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Printing ROC-AUC
print('ROC-AUC: {:.2f}'.format(roc_auc))

# Evaluating the model using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)

# Detailed classification report
print(classification_report(y_test, y_pred))


ROC-AUC: 0.90
Confusion Matrix:
 [[56830    34]
 [   20    78]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.70      0.80      0.74        98

    accuracy                           1.00     56962
   macro avg       0.85      0.90      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [41]:
#3 Predicting Heart Disease using logistic regression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report



In [42]:
# Loading the dataset
data = pd.read_csv('heart.csv')


In [43]:
# Checking for missing values
print(data.isnull().sum())



age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [44]:
print("Columns in the dataset:", data.columns.tolist())
print("Data types of columns:\n", data.dtypes)


Columns in the dataset: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
Data types of columns:
 age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


In [45]:
# By taking target as the target variable
target = 'target'

# Identifying categorical and numerical columns
categorical_cols = [col for col in data.columns if data[col].dtype == 'object' and col != target]
numerical_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64'] and col != target]

print("Categorical columns: ", categorical_cols)
print("Numerical columns: ", numerical_cols)


Categorical columns:  []
Numerical columns:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


In [46]:
# Standardization
numerical_transformer = StandardScaler()

# One-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [47]:
# Logistic regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000))])


In [48]:
# Features and target
X = data.drop(columns=target)
y = data[target]

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
# Training the model
model.fit(X_train, y_train)


In [50]:
# Making predictions on the test set
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]


In [51]:
# Evaluating the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

# Evaluating the model using ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)
print('ROC-AUC: {:.2f}'.format(roc_auc))

# Detailed classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)


Accuracy: 0.80
ROC-AUC: 0.88
              precision    recall  f1-score   support

           0       0.85      0.72      0.78       102
           1       0.76      0.87      0.81       103

    accuracy                           0.80       205
   macro avg       0.80      0.79      0.79       205
weighted avg       0.80      0.80      0.79       205

Confusion Matrix:
 [[73 29]
 [13 90]]


In [52]:
#4 Classifying Emails as Spam Using Decision Trees

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset
df = pd.read_csv('emails.csv')

# Checking for missing values and handling them
df.dropna(subset=['text', 'spam'], inplace=True)

# Encoding the target variable (categorical)
label_encoder = LabelEncoder()
df['spam'] = label_encoder.fit_transform(df['spam'])

# Features and target variable
X = df['text']
y = df['spam']

# Training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizing the text data
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Standardizing the features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_vectorized)
X_test_scaled = scaler.transform(X_test_vectorized)

# Train a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

# Making predictions
y_pred = clf.predict(X_test_scaled)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9589877835951134
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       856
           1       0.92      0.91      0.92       290

    accuracy                           0.96      1146
   macro avg       0.95      0.94      0.95      1146
weighted avg       0.96      0.96      0.96      1146



In [53]:
#5  Predicting Customer Satisfaction Using Logistic Regression

import pandas as pd

# Load the dataset
data = pd.read_csv('Customer-survey-data.csv')


In [54]:
data.isnull().sum()


Customer ID                               0
Overall Delivery Experience (Rating)    418
Food Quality (Rating)                   252
Speed of Delivery (Rating)              239
Order Accuracy                          660
dtype: int64

In [55]:

mode_order_accuracy = data['Order Accuracy'].mode()[0]
data['Order Accuracy'] = data['Order Accuracy'].fillna(mode_order_accuracy)


In [56]:

# Handle missing values for numerical data
numerical_columns = ['Overall Delivery Experience (Rating)', 'Food Quality (Rating)', 'Speed of Delivery (Rating)']
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())




In [57]:
data['Customer Satisfaction'] = (data[['Overall Delivery Experience (Rating)', 'Food Quality (Rating)', 'Speed of Delivery (Rating)']].mean(axis=1) > 3).astype(int)

In [58]:
# Encoding 'Order Accuracy' column
data['Order Accuracy'] = data['Order Accuracy'].map({'Yes': 1, 'No': 0})


In [59]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Standardizing numerical columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


In [60]:

X = data[['Overall Delivery Experience (Rating)', 'Food Quality (Rating)', 'Speed of Delivery (Rating)', 'Order Accuracy']]
y = data['Customer Satisfaction']


In [61]:
from sklearn.model_selection import train_test_split

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Training the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)


Accuracy: 1.00
Confusion Matrix:
[[1044    0]
 [   0 1080]]
