In [None]:
# Cleaning
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Imbalance
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from collections import Counter

# Models
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import (VotingClassifier, BaggingClassifier, RandomForestClassifier, 
                              AdaBoostClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             confusion_matrix, roc_curve, roc_auc_score, mean_squared_error, 
                             mean_absolute_error)

In [None]:
# Class balance of the label column.
# Only the last expression of a cell is echoed, so the counts are printed
# explicitly; otherwise they would be silently discarded.
print(df['target'].value_counts())
sns.countplot(x='target', data=df, palette='RdBu_r')

In [None]:
# Split the label off the feature frame; after this line `df` holds features only.
y = df.pop('target')

# UnderSampling
nm = NearMiss()
x_res, y_res = nm.fit_resample(df, y)
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))

# Oversampling
smk = SMOTETomek(random_state=42)
# Resample the same feature frame used above (the original referenced an
# undefined name `data` here).
xdata, ydata = smk.fit_resample(df, y)
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(ydata)))

# After Any Resampling is done
# Re-attach the label under the name it was popped from, on a copy so the
# resampler output `xdata` stays a features-only frame.
# Swap in x_res / y_res here to keep the under-sampled data instead.
df = xdata.copy()
df['target'] = ydata

In [None]:
# Feature Selection
def correlation(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    For every pair of numeric columns whose absolute pairwise correlation
    exceeds ``threshold``, the column that appears later in ``dataset`` is
    collected, so dropping the returned columns keeps one representative of
    each correlated group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect. Non-numeric columns are ignored.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    # numeric_only=True keeps string/object columns from raising inside corr().
    corr_matrix = dataset.corr(numeric_only=True).abs()
    columns = corr_matrix.columns
    col_corr = set()
    for i in range(len(columns)):
        # Row i, columns [0, i) is the strictly-lower triangle: each pair is
        # inspected once and the diagonal (always 1.0) is skipped.
        # NaN correlations compare False and are therefore never flagged.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(columns[i])
    return col_corr

# Flag features whose absolute correlation with an earlier column exceeds 0.65.
# NOTE(review): if df still contains the label column it can be flagged too;
# confirm it has been separated before dropping.
corr_features = correlation(df, 0.65)
# A bare `len(...)` in the middle of a cell is never displayed, so report it explicitly.
print('Dropping {} correlated feature(s): {}'.format(len(corr_features), list(corr_features)))
# Reassign instead of mutating in place so re-running the cell is predictable.
df = df.drop(columns=list(corr_features))

In [None]:
# Essential Plots
# Every plot opens its own figure: seaborn's axes-level functions draw on the
# current Axes, so back-to-back calls in one cell would otherwise be painted
# on top of each other.
plt.figure()
sns.countplot(x='Survived', data=df)

plt.figure()
sns.boxplot(x='Survived', y='Fare', data=df)

plt.figure()
sns.barplot(x='Gender', y='Fare', data=df)

plt.figure()
sns.histplot(df['Age'], kde=True)

plt.figure()
sns.lineplot(x='Age', y='Fare', data=df)

plt.figure()
sns.scatterplot(x='Age', y='Fare', hue='Survived', data=df)

# numeric_only=True keeps not-yet-encoded text columns from raising in corr().
plt.figure()
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='Blues')

plt.figure()
df.groupby('Age')['Fare'].sum().plot(kind='area')

# NOTE(review): y_test / y_pred are produced by the model cells further down;
# run those first (or move these two lines below them) on a fresh kernel.
plt.figure()
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')


In [None]:
# Model Selection Syntax
X = df.drop(columns=['Level'])
y = df['Level']

# Hold out 20% for testing, then carve a 30% validation set out of the remainder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

# The classifier has to exist and be fitted before it can predict. It is fitted
# on the reduced training split only, so the validation rows stay unseen.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_final, y_train_final)

y_train_pred = knn.predict(X_train_final)
train_accuracy = accuracy_score(y_train_final, y_train_pred)
y_val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
y_test_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)


# KNN
# Pass algorithm='kd_tree' to force a KD-tree neighbour search.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the same call shape is wrong for precision/recall.
acc = accuracy_score(y_test, y_pred)
print(acc*100)

# Decision tree classifier
# criterion may also be 'gini'; further knobs worth tuning: ccp_alpha
# (e.g. 0.015), max_depth, max_leaf_nodes.
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train_final, y_train_final)

# Predict on all three splits first ...
y_train_pred = clf.predict(X_train_final)
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# ... then score each split against its ground truth.
train_accuracy = accuracy_score(y_train_final, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# AdaBoost: 20 boosting rounds over depth-1 decision trees
base_estimator = DecisionTreeClassifier(max_depth=1)
model = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=20,
).fit(X_train, y_train)  # fit() returns the estimator itself

# xgboost
xgb_model = XGBClassifier(random_state=0,use_label_encoder=False,eval_metric='logloss')
# Train on the split produced by train_test_split above; the original used
# X_train_main / y_train_main, which are never defined in this notebook.
# NOTE(review): XGBClassifier may require labels encoded as 0..n_classes-1 —
# confirm the target encoding before fitting.
xgb_model.fit(X_train, y_train)
xgb_train_acc = xgb_model.score(X_train, y_train)
xgb_test_acc = xgb_model.score(X_test, y_test)

# Random forest: 15 shallow trees (max depth 2)
model = RandomForestClassifier(
    max_depth=2,
    n_estimators=15,
)
model.fit(X_train, y_train)

# Voting Classifier
# Each entry is (name, unfitted estimator); VotingClassifier fits its own
# clones, so fresh instances are built here rather than reusing fitted models.
estimators = [
    ('dt', DecisionTreeClassifier(criterion='entropy')),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('rf', RandomForestClassifier(n_estimators=15, max_depth=2)),
    ('xgb', XGBClassifier(random_state=0, eval_metric='logloss')),
]
# voting='hard' takes the majority of predicted labels; switch to 'soft' to
# average predicted probabilities (every estimator must expose predict_proba).
voting_clf = VotingClassifier(estimators=estimators, voting='hard')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
voting_clf.fit(X_train, y_train)
accuracy = voting_clf.score(X_test, y_test)

# Gradient boosting via scikit-learn's GradientBoostingClassifier
# (this is not the xgboost package, despite the variable name).
xgb = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=10,
    random_state=42,
)
xgb.fit(X_train, y_train)


In [None]:
# Evaluation Metrics
# NOTE(review): these calls use the default binary averaging and the
# positive-class probability column; a multi-class target needs `average=`
# (and `multi_class=` for ROC AUC) — confirm against the label in use.
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])
# The predictions variable is `y_pred`; the original referenced an undefined `ypred`.
rf_precision = precision_score(y_test, y_pred)
rf_recall = recall_score(y_test, y_pred)

# ROC curve for the KNN model, scored on the positive-class probability.
positive_scores = knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.plot(fpr, tpr, color='red', label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
# Without legend() the label passed to plot() above is never rendered.
plt.legend(loc='lower right')

# Manual stratified k-fold cross-validation of KNN.
# `skf` was never created in the original cell; define the splitter here.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
knn_scores = []
for train_index, test_index in skf.split(X, y):
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]
    # Fit a fresh model per fold so the already-trained global `knn` is not
    # silently refitted on data that overlaps the held-out test set.
    fold_knn = KNeighborsClassifier(n_neighbors=5)
    fold_knn.fit(X_train_cv, y_train_cv)
    knn_scores.append(accuracy_score(y_test_cv, fold_knn.predict(X_test_cv)))
knn_avg_score = sum(knn_scores) / len(knn_scores)

# Build the confusion matrix here; `cm` was never defined in the original cell.
cm = confusion_matrix(y_test, y_pred)
# ravel() on a 2x2 matrix yields TN, FP, FN, TP in that order. This unpacking
# only works for a binary target.
TN, FP, FN, TP = cm.ravel()
print(f'TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}')

In [None]:
# Pandas Essential Functions for preprocessing

- df.head()
- df.tail()
- df.shape
- df.columns
- df.info()
- df.describe()
- df.nunique()

- for col in df.columns:
	print('------------------------------')
	print(df[col].unique())

- for col in df.columns:
	print('------------------------------')
	print(df[col].value_counts())

- df.isnull().sum()

- df.dropna()
- df.fillna()
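- df.fillna(df.mean(numeric_only=True), inplace=True)
- df.fillna(df.median(numeric_only=True), inplace=True)
- df.fillna(df.mode().iloc[0], inplace=True) -> df.mode() returns a DataFrame; its first row holds the most frequent value of each column
- df.ffill(inplace=True) -> forward fill (replaces the deprecated df.fillna(df.pad(), inplace=True))
- df[colname] = df[colname].fillna(df[colname].mean()) -> assign back; inplace=True on a selected column may not update df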
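- df.bfill() -> backward fill (df.fillna(method='bfill') is deprecated in recent pandas)
- df.ffill() -> forward fill (df.fillna(method='ffill') is deprecated in recent pandas)
- df[col].bfill() or df[col].ffill()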
- df.duplicated(keep=False).sum()
- df.drop_duplicates(keep='first')

- df.drop('col', axis=1, inplace=True)
- df.drop(axis=0, index=1, inplace=True)

- df.insert(index, 'col', data)
- pd.concat([df, df2], ignore_index=True) -> replaces df.append(df2, ignore_index=True), which was removed in pandas 2.0

df3 = pd.concat([df1, df2], axis=0, ignore_index=True) -> Concatenate vertically (adding rows)
df_combined = pd.concat([df1, df2], axis=1) -> Concatenate horizontally (adding columns)
df_merged = pd.merge(df1, df2, how='inner', on='key') -> Merge DataFrames on the 'key' column

- df.reset_index(drop=False, inplace=True)
- df.set_index('col')
- df.sort_values('col', inplace=True)
- df.loc[df['col']==value, 'col'] = new_value
- df['col']
- df[['col1', 'col2']]
- column = df.pop('col')

- df['colname'] = df['colname'].replace('?', np.nan) -> requires `import numpy as np`; assign back instead of using inplace=True on a selected column
- df['Bare_Nuclei'] = df['Bare_Nuclei'].astype('float64')
- def fillValue(value):
	if value == 'High': return 3
	elif value == 'Medium': return 2
	elif value == 'Low': return 1
- df['Level'] = df['Level'].apply(fillValue)

# One-hot encode the frame with pandas.get_dummies, dropping the first level
# of every encoded feature.
df = pd.get_dummies(data=df, drop_first=True)

# Min-max scaling: fit a MinMaxScaler on the listed columns and replace
# those columns with the scaled values.
from sklearn.preprocessing import MinMaxScaler
cols = ['Fare', 'Age']
scaler = MinMaxScaler()
scaler.fit(df[cols])
df[cols] = scaler.transform(df[cols])

# Standardisation: fit the scaler on the training split only, then apply the
# same fitted transform to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

