In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
df = pd.read_csv(r"Life Expectancy Data.csv")
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [None]:
# Instead of using inplace=True, assign back to the column directly
df['Year'] = df['Year'].fillna(df['Year'].median())
df['Schooling'] = df['Schooling'].fillna(df['Schooling'].mode()[0])

# Dropping the 'Status' column
df.drop('Status', axis=1, inplace=True)

In [None]:
label_enc = LabelEncoder()
df['Country'] = label_enc.fit_transform(df['Country'])
df['Schooling'] = label_enc.fit_transform(df['Schooling'])

In [None]:
print(df.columns)

X = df.drop('Life expectancy ', axis=1)

y = df['Country']

Index(['Country', 'Year', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    # Changed to use 'weighted' averaging for multiclass
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f'---{model_name}---')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}')
    print('-'*40)

### 1. Logistic Regression

In [None]:
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings('ignore')
imputer = SimpleImputer(strategy='mean')

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
evaluate_model(y_test, y_pred_lr, "Logistic Regression")

---Logistic Regression---
Accuracy: 0.76
Precision: 0.82
Recall: 0.76
F1 Score: 0.76
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 1]]
----------------------------------------


### 2. Naive Bayes

In [None]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
evaluate_model(y_test, y_pred_nb, "Naive Bayes")

---Naive Bayes---
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 3]]
----------------------------------------


### 3. K-Nearest Neighbors (KNN)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")

---K-Nearest Neighbors---
Accuracy: 0.62
Precision: 0.69
Recall: 0.62
F1 Score: 0.62
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]]
----------------------------------------


### 4. Decision Tree

In [None]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
evaluate_model(y_test, y_pred_dt, "Decision Tree")

---Decision Tree---
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.97
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 3]]
----------------------------------------


### 5. Random Forest

In [None]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
evaluate_model(y_test, y_pred_rf, "Random Forest")

---Random Forest---
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 3]]
----------------------------------------


### 6. KMeans (Unsupervised Clustering) for analysis, not classification

In [None]:
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train)
y_pred_kmeans = kmeans.predict(X_test)

In [None]:
y_pred_kmeans = np.where(y_pred_kmeans == 0, 1, 0)

In [None]:
evaluate_model(y_test, y_pred_kmeans, "KMeans Clustering (adjusted for survival)")

---KMeans Clustering (adjusted for survival)---
Accuracy: 0.01
Precision: 0.00
Recall: 0.01
F1 Score: 0.00
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [2 2 0 ... 0 0 0]
 ...
 [4 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]]
----------------------------------------


In [None]:
print("\n--- Classification Reports ---")
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))
print("Naive Bayes:\n", classification_report(y_test, y_pred_nb))
print("K-Nearest Neighbors:\n", classification_report(y_test, y_pred_knn))
print("Decision Tree:\n", classification_report(y_test, y_pred_dt))
print("Random Forest:\n", classification_report(y_test, y_pred_rf))
print("KMeans (Clustering):\n", classification_report(y_test, y_pred_kmeans))


--- Classification Reports ---
Logistic Regression:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.40      1.00      0.57         2
           2       1.00      1.00      1.00         4
           3       1.00      0.67      0.80         3
           4       0.33      0.33      0.33         3
           5       0.67      0.67      0.67         3
           6       1.00      0.40      0.57         5
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00         4
           9       0.50      1.00      0.67         2
          10       1.00      0.60      0.75         5
          11       1.00      1.00      1.00         5
          12       0.80      1.00      0.89         4
          13       0.83      1.00      0.91         5
          14       1.00      0.50      0.67         4
          15       1.00      0.33      0.50         3
          16       0.50    

# Evaluate the performance using classification metrics.


In [None]:
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f'---{model_name}---')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}')
    print('-'*40)

# 1. Logistic Regression
y_pred_lr = log_reg.predict(X_test)
evaluate_model(y_test, y_pred_lr, "Logistic Regression")

# 2. Naive Bayes
y_pred_nb = nb_model.predict(X_test)
evaluate_model(y_test, y_pred_nb, "Naive Bayes")

# 3. K-Nearest Neighbors (KNN)
y_pred_knn = knn.predict(X_test)
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")

# 4. Decision Tree
y_pred_dt = dt_model.predict(X_test)
evaluate_model(y_test, y_pred_dt, "Decision Tree")

# 5. Random Forest
y_pred_rf = rf_model.predict(X_test)
evaluate_model(y_test, y_pred_rf, "Random Forest")

# 6. KMeans (Clustering)
y_pred_kmeans = np.where(kmeans.predict(X_test) == 0, 1, 0)
evaluate_model(y_test, y_pred_kmeans, "KMeans Clustering")

---Logistic Regression---
Accuracy: 0.76
Precision: 0.82
Recall: 0.76
F1 Score: 0.76
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 1]]
----------------------------------------
---Naive Bayes---
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 3]]
----------------------------------------
---K-Nearest Neighbors---
Accuracy: 0.62
Precision: 0.69
Recall: 0.62
F1 Score: 0.62
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]]
----------------------------------------
---Decision Tree---
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.97
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 3]]

#  Compare the performance of the entire classification algorithm (in table format having recall, precision, F1-score, accuracy))


In [None]:
model_performance = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": []
}

In [None]:
def evaluate_and_store_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')


    model_performance["Model"].append(model_name)
    model_performance["Accuracy"].append(accuracy)
    model_performance["Precision"].append(precision)
    model_performance["Recall"].append(recall)
    model_performance["F1-Score"].append(f1)


In [None]:

# 1. Logistic Regression
y_pred_lr = log_reg.predict(X_test)
evaluate_and_store_metrics(y_test, y_pred_lr, "Logistic Regression")

# 2. Naive Bayes
y_pred_nb = nb_model.predict(X_test)
evaluate_and_store_metrics(y_test, y_pred_nb, "Naive Bayes")

# 3. K-Nearest Neighbors (KNN)
y_pred_knn = knn.predict(X_test)
evaluate_and_store_metrics(y_test, y_pred_knn, "K-Nearest Neighbors")

# 4. Decision Tree
y_pred_dt = dt_model.predict(X_test)
evaluate_and_store_metrics(y_test, y_pred_dt, "Decision Tree")

# 5. Random Forest
y_pred_rf = rf_model.predict(X_test)
evaluate_and_store_metrics(y_test, y_pred_rf, "Random Forest")

# 6. KMeans (Clustering)
y_pred_kmeans = np.where(kmeans.predict(X_test) == 0, 1, 0)
evaluate_and_store_metrics(y_test, y_pred_kmeans, "KMeans Clustering")


In [None]:
performance_df = pd.DataFrame(model_performance)
performance_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.755102,0.815873,0.755102,0.755959
1,Naive Bayes,0.988095,0.986071,0.988095,0.985733
2,K-Nearest Neighbors,0.615646,0.690063,0.615646,0.619155
3,Decision Tree,0.97619,0.978225,0.97619,0.974226
4,Random Forest,0.994898,0.992687,0.994898,0.993445
5,KMeans Clustering,0.005102,2.7e-05,0.005102,5.4e-05


## Conclusion of this Mini Project :
In this project, we applied various classification algorithms and evaluated their performance using accuracy, precision, recall, and F1-score. Random Forest performed best due to its ensemble nature, while simpler models like Logistic Regression and Naive Bayes were faster but less accurate. The choice of the best model depends on the dataset's complexity and the need for either performance or interpretability.