In [None]:
# Importing SimpleImputer from sklearn's impute module
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from scipy import stats
from scipy.stats import zscore

# Other necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Render and save every figure at 300 DPI (set through seaborn and matplotlib)
sns.set(rc={"figure.dpi": 300, "savefig.dpi": 300})
plt.rcParams.update({"figure.dpi": 300, "savefig.dpi": 300})

# Show up to 29 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 29)

In [None]:
# Load the dataset from "data.csv" (path relative to the notebook's working
# directory) into the DataFrame `df` that the cells below read from.
df = pd.read_csv("data.csv")

In [None]:
# Display the first five rows of the data
df.head()

In [None]:
# Display the last five rows of the data
df.tail()

In [None]:
# Number of rows and columns in the dataset, as a (rows, columns) tuple
df.shape

In [None]:
# Column names, non-null counts and dtypes of the dataset
df.info()

In [None]:
# Count the missing (null) values in each column
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Distribution of the target variable: number of rows per value of "status"
df["status"].value_counts()

In [None]:
# Pie chart of the share of each "status" value; the first slice is pulled
# out slightly from the centre.
explode = (0.08, 0)

status_counts = df["status"].value_counts()
status_counts.plot.pie(
    explode=explode,
    colors=["#99ff99", "#ff6666"],
    autopct="%1.2f%%",
    figsize=(3, 3),
)
plt.title("Pie plot of distribution of status column", fontsize=14)
plt.tight_layout()
plt.legend()
plt.show()

In [None]:
# Mean of every numeric feature per "status" class.
# numeric_only=True skips non-numeric columns such as "name"; without it
# pandas >= 2.0 raises a TypeError when asked to average string values.
df.groupby("status").mean(numeric_only=True)

In [None]:
# Pairwise scatter plots of all columns except "name", coloured by "status",
# with kernel-density estimates on the diagonal.
sns.pairplot(df.drop("name", axis=1), hue="status", diag_kind="kde")

In [None]:
# Annotated heatmap of the Spearman correlation between all features.
# The small font scale keeps the per-cell annotations readable.
sns.set(font_scale=0.25)
cmap = sns.diverging_palette(260, 10, as_cmap=True)
sns.heatmap(
    df.drop(["name", "status"], axis=1).corr("spearman"),
    # Correlations lie in [-1, 1]; pin the colour scale to that range and
    # centre it on 0 so the two arms of the diverging palette are symmetric.
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    square=True,  # boolean flag: draw each cell as a square
    cmap=cmap,
    fmt='.0%',
    linewidths=2,
)

In [None]:
# Set the font scale for the plot (small, so the per-cell annotations fit)
sns.set(font_scale=0.25)

# Create a correlation matrix of the features using the Spearman (rank) method
corr_matrix = df.drop(["name", "status"], axis=1).corr(method="spearman")

# Create a diverging color palette for the heatmap
cmap = sns.diverging_palette(260, 10, as_cmap=True)

# Create the heatmap. Correlations lie in [-1, 1], so the colour scale is
# pinned to that range and centred on 0 to keep the diverging palette symmetric.
plt.figure(figsize=(10, 8))  # Set the size of the plot
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    square=True,
    cmap=cmap,
    fmt=".0%",
    linewidths=2,
)

# Show the plot
plt.show()


In [None]:
# Calculate the correlation matrix of the features (pandas default method)
corr_matrix = df.drop(["name", "status"], axis=1).corr()

# Find highly correlated features
corr_threshold = 0.85
# Every feature that takes part in at least one pair whose absolute
# correlation exceeds the threshold (both members of the pair are recorded).
high_corr_features = set()

for i in range(len(corr_matrix.columns)):
    # j < i walks the strictly-lower triangle: each pair once, no diagonal
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > corr_threshold:
            high_corr_features.add(corr_matrix.columns[i])
            high_corr_features.add(corr_matrix.columns[j])

# Convert the set to a sorted list so the order is the same on every run
# (iteration order of a set of strings can change between kernel sessions)
high_corr_features_list = sorted(high_corr_features)

# Print the highly correlated features
print("Highly correlated features:", high_corr_features_list)


In [None]:
# One-way ANOVA of every feature against the target "status".
# Features whose p-value is not below 0.05 are collected in `coldrop`.
df_anova = df.drop(["name"], axis=1)
grps = pd.unique(df_anova.status.values)

coldrop = []

# Iterate over the feature columns by name (everything except the target),
# so the result does not depend on where "status" sits in the DataFrame.
for col in df_anova.columns.drop("status"):

    # Values of this feature, split by status group
    d_data = {grp: df_anova.loc[df_anova.status == grp, col] for grp in grps}

    # Pass every group, whatever its label, to the F-test
    F, p = stats.f_oneway(*d_data.values())
    print("P_Value of {} and status".format(col), p)

    if p < 0.05:
        print("There is relation between {} and status \n".format(col))
    else:
        print("There is no relation between {} and status \n".format(col))
        coldrop.append(col)

In [None]:
# Column labels that remain once "name" is removed (df itself is not modified)
df.drop("name", axis=1).columns

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Look for constant (zero-variance) columns among everything except "name"
features_no_name = df.drop("name", axis=1)

var_thres = VarianceThreshold(threshold=0)
var_thres.fit(features_no_name)

# Boolean mask with one entry per column: True where the column is kept
var_support = var_thres.get_support()

selected_columns = features_no_name.columns[var_support]
print("Selected columns after variance threshold:", selected_columns)

# The columns rejected by the selector are the constant ones
constant_columns = list(features_no_name.columns[~var_support])
print("Constant columns:", constant_columns)
print("Number of constant columns:", len(constant_columns))


In [None]:
# Features flagged by the correlation analysis earlier in the notebook.
# NOTE(review): high_corr_features_list holds BOTH members of every highly
# correlated pair; dropping all of them would also discard the signal they
# share. Consider keeping one feature per pair before actually dropping.
corr_features = list(high_corr_features_list)

# `coldrop` (features with no significant ANOVA relation to status) is reused
# exactly as computed by the ANOVA cell; it is deliberately not reassigned here.

# Combine both lists plus the "name" column, removing duplicates while
# keeping first-seen order.
list_drop = list(dict.fromkeys(corr_features + coldrop + ["name"]))

# Print the final list of columns to drop
print("Columns to drop:", list_drop)


In [None]:
# Show the DataFrame's current columns next to the planned drop list so the
# two can be compared by eye.
for caption, value in (
    ("Current column names:", df.columns),
    ("Columns to drop:", list_drop),
):
    print(caption, value)



In [None]:
# Target vector: the "status" column
y = df["status"]
# Feature matrix: every column except the target
X = df.drop(columns=["status"])

In [None]:
# First five rows of the feature matrix
X.head()

In [None]:
# First five values of the target
y.head()

In [None]:
# Standardise the numeric columns of X. StandardScaler is already imported
# at the top of the notebook.
# NOTE(review): X presumably still contains the non-numeric "name" column
# at this point, which is why the numeric columns are selected first.

# Select every numeric column, whatever its exact int/float width
numerical_columns = X.select_dtypes(include="number").columns

# Filter the DataFrame to keep only the numerical columns
X_numeric = X[numerical_columns]

# Initialize the StandardScaler
scaler = StandardScaler()

# Apply the StandardScaler to the numerical features
scaled_features = scaler.fit_transform(X_numeric)

# Create a DataFrame with the scaled features, keeping X's row index so the
# rows stay aligned with y
scaled_df = pd.DataFrame(scaled_features, columns=numerical_columns, index=X_numeric.index)

# Preview the scaled features instead of printing the whole frame
scaled_df.head()


In [None]:
# Build the train/test sets used by every model below. train_test_split,
# StandardScaler, pd and np are already imported at the top of the notebook.

# Selecting the features and labels
X = df.drop(["name", "status"], axis=1)
y = df["status"]

# Split BEFORE scaling so that no statistics of the test rows leak into the
# scaler; stratify=y keeps the status class ratio the same in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both splits.
# X_train / X_test are NumPy arrays; y_train / y_test stay pandas Series.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept for any cell that still
# refers to `feature`
feature = scaler.transform(X)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


In [None]:
# First five training labels
y_train.head()

In [None]:
# Preview the first five scaled test rows rather than dumping the whole array
X_test[:5]

In [None]:
# First five test labels
y_test.head()

In [None]:
# Report the shape of each train/test split
for caption, split in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(caption, split.shape)

In [None]:
# Number of training samples labelled 1, then the number labelled 0
n_status_1 = (y_train == 1).sum()
n_status_0 = (y_train == 0).sum()
print(n_status_1, n_status_0)

In [None]:
pip install imbalanced-learn

In [None]:
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


In [None]:
# Positions (0-based, within y_train) of the samples of each class.
# np.where on a 1-D condition returns a one-element tuple, unpacked here.
(class_0_indices,) = np.where(y_train == 0)
(class_1_indices,) = np.where(y_train == 1)


In [None]:
# Size of the smaller of the two classes in the training labels
min_class_size = min(len(class_0_indices), len(class_1_indices))


In [None]:
def lr_grid_search(X, y):
    """Grid-search logistic-regression hyper-parameters.

    Every combination of solver, penalty and C listed below is scored by
    accuracy with 10-fold stratified cross-validation repeated 3 times
    (fixed random_state=1), using all available CPU cores.

    Parameters
    ----------
    X : feature matrix handed to ``GridSearchCV.fit``.
    y : target labels handed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search_space = {
        "solver": ['newton-cg', 'lbfgs', 'liblinear'],
        "penalty": ['l2'],
        "C": [100, 10, 1.0, 0.1, 0.01],
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=search_space,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    # fit() returns the fitted search object itself
    return search.fit(X, y).best_params_

In [None]:
# Best logistic-regression hyper-parameters found on the training split
lr_grid_search(X_train, y_train)

In [None]:
# Fit the logistic-regression model.
# NOTE(review): the hyper-parameters are hard-coded; presumably they were
# copied from the lr_grid_search output — confirm they still match.
lr = LogisticRegression(solver='liblinear', penalty='l2', C=0.01).fit(X_train, y_train)

# Per-class precision / recall / F1 on the test split
y_pred_lr = lr.predict(X_test)
print(metrics.classification_report(y_test, y_pred_lr))

# Accuracy on the training split, then on the test split
# (lr_score ends up holding the test accuracy)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    lr_score = lr.score(features, labels)
    print(lr_score)

In [None]:
# Keep the logistic-regression accuracies under model-specific names
lr_train_acc = lr.score(X_train, y_train)  # training accuracy
lr_tacc = lr.score(X_test, y_test)         # test accuracy

In [None]:
# Confusion matrix of the logistic-regression test predictions, with class 1
# listed first (rows = true label, columns = predicted label)
cm = metrics.confusion_matrix(y_test, y_pred_lr, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

In [None]:
# ROC curve and AUC of the logistic-regression model on the test split.
# Column 1 of predict_proba is the probability of the second entry of
# lr.classes_ (class 1 for 0/1 labels).
y_pred_proba = lr.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
# Keep the logistic-regression AUC under its own name, because `auc` is
# reassigned by the ROC cells of the other models; then display it.
lr_auc = auc
lr_auc

In [None]:
def dtree_grid_search(X, y):
    """Grid-search decision-tree hyper-parameters.

    Tries both split criteria with every max_depth from 2 to 14, scoring
    each combination by accuracy with 10-fold stratified cross-validation
    repeated 3 times (fixed random_state=1), using all available CPU cores.

    Parameters
    ----------
    X : feature matrix handed to ``GridSearchCV.fit``.
    y : target labels handed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search_space = {
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(2, 15),  # upper bound is exclusive: 2..14
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    search = GridSearchCV(
        DecisionTreeClassifier(),
        search_space,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    search.fit(X, y)
    return search.best_params_

In [None]:
# Fit a shallow decision tree (entropy criterion, depth 2).
# NOTE(review): the hyper-parameters are hard-coded and dtree_grid_search is
# not called anywhere in the visible notebook — confirm where they came from.
dTree = DecisionTreeClassifier(max_depth=2, criterion='entropy').fit(X_train, y_train)

# Accuracy on the training split, then on the test split
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(dTree.score(features, labels))

# Per-class precision / recall / F1 on the test split
y_pred_dtree = dTree.predict(X_test)
print(metrics.classification_report(y_test, y_pred_dtree))

In [None]:
# Keep the decision-tree accuracies under model-specific names
dt_train_acc = dTree.score(X_train, y_train)  # training accuracy
dt_tacc = dTree.score(X_test, y_test)         # test accuracy

In [None]:
# Confusion matrix of the decision-tree test predictions, with class 1
# listed first (rows = true label, columns = predicted label)
cm = metrics.confusion_matrix(y_test, y_pred_dtree, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

In [None]:
# ROC curve and AUC of the decision tree on the test split.
# Column 1 of predict_proba is the probability of the second entry of
# dTree.classes_ (class 1 for 0/1 labels).
y_pred_proba = dTree.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
# Keep the decision-tree AUC under its own name, because `auc` is reassigned
# by the ROC cells of the other models; then display it.
dt_auc = auc
dt_auc

In [None]:
def ada_grid_search(X, y):
    """Grid-search AdaBoost hyper-parameters.

    Every combination of n_estimators and learning_rate listed below is
    scored by accuracy with 10-fold stratified cross-validation repeated
    3 times (fixed random_state=1), using all available CPU cores.

    Parameters
    ----------
    X : feature matrix handed to ``GridSearchCV.fit``.
    y : target labels handed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search_space = {
        'n_estimators': [10, 50, 100, 500],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    }
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    search = GridSearchCV(
        AdaBoostClassifier(),
        search_space,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    search.fit(X, y)
    return search.best_params_

In [None]:
# Best AdaBoost hyper-parameters found on the training split
ada_grid_search(X_train, y_train)

In [None]:
# Fit the AdaBoost model.
# NOTE(review): n_estimators=15 is not one of the values searched by
# ada_grid_search ([10, 50, 100, 500]) — confirm where it came from.
abcl = AdaBoostClassifier(learning_rate=0.01, n_estimators=15).fit(X_train, y_train)

y_pred_abcl = abcl.predict(X_test)

# Accuracy on the training split, then on the test split
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(abcl.score(features, labels))

# Per-class precision / recall / F1 on the test split
print(metrics.classification_report(y_test, y_pred_abcl))

In [None]:
# Keep the AdaBoost accuracies under model-specific names
ada_tacc = abcl.score(X_test, y_test)          # test accuracy
ada_train_acc = abcl.score(X_train, y_train)   # training accuracy

In [None]:
# Confusion matrix of the AdaBoost test predictions, with class 1 listed
# first (rows = true label, columns = predicted label)
cm = metrics.confusion_matrix(y_test, y_pred_abcl, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

In [None]:
# ROC curve and AUC of the AdaBoost model on the test split.
# Column 1 of predict_proba is the probability of the second entry of
# abcl.classes_ (class 1 for 0/1 labels).
y_pred_proba = abcl.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
# Keep the AdaBoost AUC under its own name, because `auc` is reassigned by
# the ROC cells of the other models; then display it.
ada_auc = auc
ada_auc