# Naive Bayes Practice

In [180]:
import numpy as np
import pandas as pd


In [181]:
# Read only the two predictors and the target from the ads file.
df = pd.read_csv(
    'Social_Network_Ads.csv',
    usecols=['Age', 'EstimatedSalary', 'Purchased'],
)
df.head(n=3)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0


In [182]:
# Target is the 'Purchased' column; every remaining column is a feature.
y = df['Purchased']
x = df.drop('Purchased', axis=1)

In [183]:
from sklearn.model_selection import train_test_split

In [184]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [185]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply them.
sc = StandardScaler()
sc.fit(x_train)
x_train_new = sc.transform(x_train)

In [186]:
# Reuse the scaler fitted on the training rows; it is not refit here.
x_test_new = sc.transform(X=x_test)

In [187]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes estimator built with its default arguments.
classifier = GaussianNB()

In [188]:
# Train on the standardized training features and their labels.
classifier.fit(X=x_train_new, y=y_train)

In [189]:
# Predict labels for the standardized hold-out rows.
y_pred = classifier.predict(X=x_test_new)

In [190]:
from sklearn.metrics import accuracy_score

In [191]:
# Fraction of hold-out rows predicted correctly, printed as a percentage value.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}")

Accuracy: 93.75


# Credit Score Dataset

In [288]:
# NOTE: this rebinds `df`, replacing the frame loaded in the previous section.
df = pd.read_csv(filepath_or_buffer="credit_scoring.csv")
df.head(n=3)

Unnamed: 0,Age,Gender,Marital Status,Education Level,Employment Status,Credit Utilization Ratio,Payment History,Number of Credit Accounts,Loan Amount,Interest Rate,Loan Term,Type of Loan
0,60,Male,Married,Master,Employed,0.22,2685.0,2,4675000,2.65,48,Personal Loan
1,25,Male,Married,High School,Unemployed,0.2,2371.0,9,3619000,5.19,60,Auto Loan
2,30,Female,Single,Master,Employed,0.22,2771.0,6,957000,2.76,12,Auto Loan


In [290]:
# Per-column count of missing values.
df.isna().sum()

Age                          0
Gender                       0
Marital Status               0
Education Level              0
Employment Status            0
Credit Utilization Ratio     0
Payment History              0
Number of Credit Accounts    0
Loan Amount                  0
Interest Rate                0
Loan Term                    0
Type of Loan                 0
dtype: int64

In [292]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [294]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# -----------------------------
# 0. Features, target and split
# -----------------------------
# `df` is the credit-scoring frame read in an earlier cell; nothing is loaded here.
y = df["Type of Loan"]
X = df.drop(columns=["Type of Loan"])

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 1. Numeric columns (standardize)
# -----------------------------
numeric_cols = [
    "Age",
    "Credit Utilization Ratio",
    "Payment History",
    "Number of Credit Accounts",
    "Loan Amount",
    "Interest Rate",
    "Loan Term",
]

# Statistics come from the training rows only and are reused for the test rows.
scaler = StandardScaler()
scaler.fit(x_train[numeric_cols])
x_train_num = scaler.transform(x_train[numeric_cols])
x_test_num = scaler.transform(x_test[numeric_cols])

# -----------------------------
# 2. Categorical columns (one-hot)
# -----------------------------
categoric_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Employment Status",
]

# handle_unknown="ignore" lets transform accept categories not seen during fit.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder.fit(x_train[categoric_cols])
x_train_cat = encoder.transform(x_train[categoric_cols])
x_test_cat = encoder.transform(x_test[categoric_cols])

# -----------------------------
# 3. Combine numeric + categorical
# -----------------------------
# Column order is numeric block first, then the one-hot block.
x_train_clean = np.concatenate((x_train_num, x_train_cat), axis=1)
x_test_clean = np.concatenate((x_test_num, x_test_cat), axis=1)

# -----------------------------
# 4. Encode target variable
# -----------------------------
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# -----------------------------
# Final shapes check
# -----------------------------
for label, arr in (
    ("x_train_clean shape:", x_train_clean),
    ("x_test_clean shape:", x_test_clean),
    ("y_train_enc shape:", y_train_enc),
    ("y_test_enc shape:", y_test_enc),
):
    print(label, arr.shape)


x_train_clean shape: (800, 19)
x_test_clean shape: (200, 19)
y_train_enc shape: (800,)
y_test_enc shape: (200,)


In [299]:
# Build a new Gaussian Naive Bayes model and train it on the encoded credit data.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = GaussianNB().fit(x_train_clean, y_train_enc)

# Predicted (integer-encoded) loan types for the hold-out rows.
y_pred = classifier.predict(X=x_test_clean)

# -----------------------------
# Accuracy
# -----------------------------
accuracy = accuracy_score(y_true=y_test_enc, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%\n")


Accuracy: 31.00%



In [203]:
# Re-run of the Social Network Ads experiment, made self-contained.
#
# This cell used to rely on leftover kernel state: `x_train_new` / `x_test_new`
# come from the ads section, but by this point in the notebook `y_train` /
# `y_test` have been rebound to the credit-scoring labels. Run top to bottom,
# the fit would receive features and labels from two different datasets.
# The ads data is therefore rebuilt here under its own `ads_*` names so the
# cell neither depends on nor clobbers the credit-section split.
ads_df = pd.read_csv(
    'Social_Network_Ads.csv',
    usecols=['Age', 'EstimatedSalary', 'Purchased'],
)
ads_x = ads_df.drop(columns=['Purchased'])
ads_y = ads_df['Purchased']

# Same split settings as the first section, so the same rows are held out.
ads_x_train, ads_x_test, ads_y_train, ads_y_test = train_test_split(
    ads_x, ads_y, test_size=0.2, random_state=42
)

# Scaling statistics are learned from the training rows only.
ads_scaler = StandardScaler()
ads_x_train_new = ads_scaler.fit_transform(ads_x_train)
ads_x_test_new = ads_scaler.transform(ads_x_test)

# `classifier`, `y_pred` and `accuracy` are still (re)bound, as before.
classifier = GaussianNB()
classifier.fit(ads_x_train_new, ads_y_train)
y_pred = classifier.predict(ads_x_test_new)
accuracy = accuracy_score(ads_y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}")

Accuracy: 93.75


# Covid Dataset

In [205]:
# NOTE: this rebinds `df` to the covid toy dataset.
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [207]:
# Per-column count of missing values.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [209]:
# Replace missing 'fever' readings with the column mean.
# NOTE(review): the mean is learned from every row, i.e. before the train/test
# split performed in a later cell, so the hold-out rows influence the fill
# value — consider imputing after the split.
si = SimpleImputer(strategy='mean')
si.fit(df[['fever']])
df['fever'] = si.transform(df[['fever']])
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [211]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every text column in place.
#
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# integer codes can later be mapped back (`inverse_transform`) or applied to
# new rows. Refitting one shared encoder would keep only the mapping of the
# last column it saw.
#
# NOTE(review): LabelEncoder is documented for target values; for the nominal
# feature columns an OrdinalEncoder / OneHotEncoder is the usual choice.
label_encoders = {}
for column in ['gender', 'cough', 'city', 'has_covid']:
    lb = LabelEncoder()
    df[column] = lb.fit_transform(df[column])
    label_encoders[column] = lb
# After the loop `lb` is the encoder fitted on 'has_covid', matching what a
# single reused encoder would hold at this point.

In [213]:
# Target is the encoded 'has_covid' column; every remaining column is a feature.
y = df['has_covid']
x = df.drop('has_covid', axis=1)


In [215]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [217]:
# Refit the `classifier` object created in an earlier cell on the covid split.
# The features are passed as-is; no scaling step is applied in this section.
classifier.fit(X=x_train, y=y_train)
y_pred = classifier.predict(X=x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}")

Accuracy: 60.00


# Titanic Dataset

In [317]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("titanic.csv")

# Drop identifier-like columns; errors="ignore" tolerates files that lack some of them.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

# Separate target and features
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Separate categorical and numeric columns
categorical = X.select_dtypes(include=["object"]).columns
numeric = X.select_dtypes(include=[np.number]).columns

# Train-test split FIRST, on the raw features. Every preprocessing step below
# is fitted on the training rows only and merely applied to the test rows, so
# no statistic of the hold-out set (means, modes, category lists, scaling
# parameters) leaks into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values: column mean for numeric, most frequent value for categorical.
si_num = SimpleImputer(strategy="mean")
train_num = si_num.fit_transform(X_train_raw[numeric])
test_num = si_num.transform(X_test_raw[numeric])

si_cat = SimpleImputer(strategy="most_frequent")
train_cat = si_cat.fit_transform(X_train_raw[categorical])
test_cat = si_cat.transform(X_test_raw[categorical])

# Encode categorical variables; categories unseen in training become all-zero rows.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
train_cat_encoded = ohe.fit_transform(train_cat)
test_cat_encoded = ohe.transform(test_cat)

# Combine numeric + categorical (numeric block first, then the one-hot block).
X_train = np.hstack([train_num, train_cat_encoded])
X_test = np.hstack([test_num, test_cat_encoded])

# Column names of the combined matrix, kept for inspecting coefficients later.
feature_names = list(numeric) + list(ohe.get_feature_names_out(categorical))

# Standardize ALL columns (numeric and one-hot alike), fitted on train only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression with hyperparameter tuning
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],   # Inverse regularization strength
    "penalty": ["l1", "l2"],        # Regularization type
    "solver": ["liblinear"]         # Solver that supports L1 + L2
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

# Best model
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# Predictions
y_pred = best_model.predict(X_test)

# Accuracy
# NOTE(review): a perfect hold-out score is unusual for this kind of data —
# check whether one of the remaining features fully determines `Survived`.
accuracy = accuracy_score(y_test, y_pred)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

print(f"Accuracy: {accuracy*100:.2f}")


Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        31

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Accuracy: 100.00
