1: Loan Approval Prediction

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv("C:\\Users\\lasya\\Downloads\\LoanEligibility.csv")

data = data.dropna()
# Split features and target
X = data.drop("Approved", axis=1)
y = data["Approved"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Models
log_reg = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier()

# Train models
log_reg.fit(X_train, y_train)
dt.fit(X_train, y_train)

# Accuracy
print("Logistic Regression Accuracy:",
      accuracy_score(y_test, log_reg.predict(X_test)))

print("Decision Tree Accuracy:",
      accuracy_score(y_test, dt.predict(X_test)))

# New applicants
new_applicants = pd.DataFrame({
    "ApplicantID":[11,12,13],
    "Age":[34,29,42],
    "Income(₹000)":[48,38,55],
    "LoanAmount(₹000)":[125,110,140],
    "CreditScore":[720,690,700]
})

print("Logistic Regression Prediction:",
      log_reg.predict(new_applicants))

print("Decision Tree Prediction:",
      dt.predict(new_applicants))


Logistic Regression Accuracy: 1.0
Decision Tree Accuracy: 1.0
Logistic Regression Prediction: [1. 1. 1.]
Decision Tree Prediction: [1. 1. 1.]


2: Employee Attrition Prediction

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Load dataset
data = pd.read_csv("C:\\Users\\lasya\\Downloads\\Logistic_EmployeeAttrition.csv")
data = data.dropna()
# Split features and target
X = data.drop("Attrition", axis=1)
y = data["Attrition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Models
log_reg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()

# Train models
log_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Accuracy
print("Logistic Regression Accuracy:",
      accuracy_score(y_test, log_reg.predict(X_test)))

print("Random Forest Accuracy:",
      accuracy_score(y_test, rf.predict(X_test)))

# New employees
new_employees = pd.DataFrame({
    "EmpID":[11,12],
    "Age":[31,27],
    "YearsAtCompany":[3,1],
    "MonthlyIncome(₹000)":[40,32],
    "WorkLifeBalance":[3,2]
})

print("Logistic Regression Prediction:",
      log_reg.predict(new_employees))

print("Random Forest Prediction:",
      rf.predict(new_employees))


Logistic Regression Accuracy: 0.0
Random Forest Accuracy: 0.0
Logistic Regression Prediction: [1. 1.]
Random Forest Prediction: [1. 0.]


3: Student Performance Classification

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv("C:\\Users\\lasya\\Downloads\\StudentPerformance.csv")
data = data.dropna()

# SAFE method
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

dt = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=3)

dt.fit(X_train, y_train)
knn.fit(X_train, y_train)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print("KNN Accuracy:", accuracy_score(y_test, knn.predict(X_test)))


Decision Tree Accuracy: 1.0
KNN Accuracy: 1.0


4:Productpurchase

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv("C:\\Users\\lasya\\Downloads\\ProductPurchase.csv")

# Remove missing values
data = data.dropna()

# SAFE feature-target split
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Models
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=42)

# Train models
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Accuracy
print("Decision Tree Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print("Random Forest Accuracy:", accuracy_score(y_test, rf.predict(X_test)))

# New prediction data (MATCH COLUMN NAMES)
new_customers = pd.DataFrame({
    "CustomerID":[11,12],
    "Age":[33,38],
    "Income(₹000)":[42,50],
    "VisitedWebsite":[4,5],
    "PreviousPurchases":[1,2]
})

# Match column order safely
new_customers = new_customers[X.columns]

print("Decision Tree Prediction:", dt.predict(new_customers))
print("Random Forest Prediction:", rf.predict(new_customers))


Decision Tree Accuracy: 0.3333333333333333
Random Forest Accuracy: 0.3333333333333333
Decision Tree Prediction: [0. 1.]
Random Forest Prediction: [1. 1.]


5.Customer Churn Prediction

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


data = pd.read_csv("C:\\Users\\lasya\\Downloads\\ProductPurchase.csv")

# Strip any leading/trailing spaces from column names
data.columns = data.columns.str.strip()

print("Before cleaning:", data["Purchased"].unique())

# Map Yes/No variants to 1/0
data["Purchased"] = data["Purchased"].replace({
    "Yes": 1, "yes": 1, "Y": 1,
    "No": 0, "no": 0, "N": 0
})

# Convert anything invalid to NaN
data["Purchased"] = pd.to_numeric(data["Purchased"], errors="coerce")

# Remove rows where Purchased is NaN
data = data.dropna(subset=["Purchased"])

print("Rows left after cleaning:", len(data))
print("After cleaning:", data["Purchased"].unique())

X = data.drop(["Purchased", "CustomerID"], axis=1)
y = data["Purchased"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)
# TRAIN MODELS

lr.fit(X_train, y_train)
rf.fit(X_train, y_train)

#MODEL ACCURACY

print("Logistic Regression Accuracy:",
      accuracy_score(y_test, lr.predict(X_test)))

print("Random Forest Accuracy:",
      accuracy_score(y_test, rf.predict(X_test)))

# NEW CUSTOMER PREDICTION

new_customers = pd.DataFrame({
    "Age": [33, 38],
    "Income(₹000)": [42, 50],  # match CSV column
    "VisitedWebsite": [1, 0],
    "PreviousPurchases": [1, 2]
})

# Match column order
new_customers = new_customers.reindex(columns=X.columns)

# Make predictions
lr_pred = lr.predict(new_customers)
rf_pred = rf.predict(new_customers)

# Convert 1/0 → Yes/No
lr_pred_text = ["Yes" if x==1 else "No" for x in lr_pred]
rf_pred_text = ["Yes" if x==1 else "No" for x in rf_pred]

print("Logistic Regression Prediction:", lr_pred_text)
print("Random Forest Prediction:", rf_pred_text)


Before cleaning: [ 0.  1. nan]
Rows left after cleaning: 10
After cleaning: [0. 1.]
Logistic Regression Accuracy: 0.3333333333333333
Random Forest Accuracy: 0.3333333333333333
Logistic Regression Prediction: ['No', 'Yes']
Random Forest Prediction: ['No', 'Yes']


6.CreditCardDefault

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


data = pd.read_csv("C:\\Users\\lasya\\Downloads\\CreditCardDefault.csv")
data.columns = data.columns.str.strip()  # remove any extra spaces


# Check target column
print("Unique values in Default column before cleaning:", data["Default"].unique())

# Convert to numeric (if not already)
data["Default"] = pd.to_numeric(data["Default"], errors="coerce")

# Drop rows where target is NaN
data = data.dropna(subset=["Default"])
print("Rows after cleaning:", len(data))


X = data.drop("Default", axis=1)
y = data["Default"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
#  CREATE MODELS
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

#  TRAIN MODELS

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

#  MODEL ACCURACY

print("Decision Tree Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print("Random Forest Accuracy:", accuracy_score(y_test, rf.predict(X_test)))

#  NEW CUSTOMER PREDICTION

new_customers = pd.DataFrame({
    "CID": [11, 12],
    "Balance": [12500, 9000],
    "Purchases": [4, 3],
    "LatePayments": [2, 0],
    "Income": [42000, 48000]
})

# Reorder columns to match training data
new_customers = new_customers[X.columns]

# Predictions
dt_pred = dt.predict(new_customers)
rf_pred = rf.predict(new_customers)

# Convert 1/0 → Yes/No for readability
dt_pred_text = ["Yes" if x==1 else "No" for x in dt_pred]
rf_pred_text = ["Yes" if x==1 else "No" for x in rf_pred]

print("Decision Tree Prediction:", dt_pred_text)
print("Random Forest Prediction:", rf_pred_text)


Unique values in Default column before cleaning: [ 0.  1. nan]
Rows after cleaning: 10
Decision Tree Accuracy: 0.3333333333333333
Random Forest Accuracy: 0.3333333333333333
Decision Tree Prediction: ['Yes', 'No']
Random Forest Prediction: ['Yes', 'No']


7.Health Risk Classification

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

data = pd.read_csv("C:\\Users\\lasya\\Downloads\\Health_Risk_Classification.csv")
data.columns = data.columns.str.strip()  # remove extra spaces

target_col = 'RiskCategory'

# Drop ID column
X = data.drop([target_col, 'PatientID'], axis=1)
y = data[target_col]


for col in X.columns:
    if X[col].dtype in ['int64', 'float64']:
        X[col] = X[col].fillna(X[col].median())


y = y.fillna(method='ffill')  



print("Missing values in X:\n", X.isna().sum())
print("Missing values in y:\n", y.isna().sum())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# CREATE MODELS

dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
#  TRAIN MODELS

dt.fit(X_train, y_train)
knn.fit(X_train, y_train)
# ACCURACY

print("Decision Tree Accuracy:", accuracy_score(y_test, dt.predict(X_test)))
print("KNN Accuracy:", accuracy_score(y_test, knn.predict(X_test)))

# NEW PATIENTS PREDICTION

new_patients = pd.DataFrame({
    "Age":[35,50,42,28],
    "BMI":[24,30,27,22],
    "BloodPressure":[85,95,88,78],
    "Cholesterol":[180,220,200,170],
    "SmokingYears":[5,20,10,0]
})

# Ensure columns match
new_patients = new_patients[X.columns]

# Predictions
dt_pred = dt.predict(new_patients)
knn_pred = knn.predict(new_patients)

print("Decision Tree Prediction:", dt_pred)
print("KNN Prediction:", knn_pred)


Missing values in X:
 Age              0
BMI              0
BloodPressure    0
Cholesterol      0
SmokingYears     0
dtype: int64
Missing values in y:
 0
Decision Tree Accuracy: 0.4
KNN Accuracy: 0.4
Decision Tree Prediction: ['Medium Risk' 'High Risk' 'Medium Risk' 'Medium Risk']
KNN Prediction: ['Medium Risk' 'High Risk' 'Medium Risk' 'Medium Risk']


  y = y.fillna(method='ffill')  # forward fill as safe approach
