# Project : H1-B Prediction Analysis

## Aim :
To predict the case status of an H-1B visa application based on six years of application data.

## Prediction Flow

1. Import the required libraries.
2. Understanding the data
3. Cleaning the Data
4. Combine related SOC_NAME values into a common SOC_NAME_NEW category
5. EDA on the companies with the highest number of applications and their average salaries
6. Identify Feature Importances.
7. Apply Logistic Regression, Decision Tree and Random Forest classifiers, GaussianNB, MLP Classifier and GradientBoostingClassifier

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from collections import Counter as c  
from matplotlib.pyplot import plot  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


In [None]:
# Load the H-1B petition data from a CSV file in the working directory.
df=pd.read_csv("h1b_kaggle.csv")
# (rows, columns) of the raw data.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# df.describe() is a quick way to get an idea of the distribution of the data
# in a DataFrame and to identify potential issues such as missing data,
# outliers and extreme values. It also provides a starting point for more
# in-depth analysis and visualization of the data.
#
# These notes are comments rather than a trailing string literal: a string
# placed after the call becomes the cell's last expression, so the notebook
# would display the string instead of the summary table.
df.describe()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
#
# lon, lat and SOC_NAME have the highest number of null values.
# SOC_NAME stands for "Standard Occupational Classification Name." It is a
# column in certain datasets that contain information about workers in the
# United States. The SOC system is used by the U.S. government to classify and
# track various types of jobs in the labor market. It defines over 800 detailed
# occupations, which are organized into 23 major groups. Each occupation is
# assigned a unique code and a corresponding job title.
#
# These notes are comments rather than a trailing string literal so that the
# null counts remain the cell's last expression and are actually displayed.
df.isna().sum()

In [None]:
print(df['YEAR'].unique())    # Distinct years covered by the data

## Data Cleaning

In [None]:
# Treat a missing employer name as its own 'Unknown' category.
df['EMPLOYER_NAME'] = df['EMPLOYER_NAME'].fillna('Unknown')

In [None]:
# Inspect the YEAR column before cleaning it.
df['YEAR']

In [None]:
# Coerce YEAR to a numeric dtype; anything unparseable becomes NaN.
year_numeric = pd.to_numeric(df['YEAR'], errors='coerce')

# Median of the YEAR column, ignoring NaN entries.
median = np.nanmedian(year_numeric)

# Impute the missing years with that median.
df['YEAR'] = year_numeric.fillna(median)

### Updating the target variable into binary output

In [None]:
df['CASE_STATUS'].value_counts() # Raw outcome counts; the target is reduced to two values below: CERTIFIED or DENIED

In [None]:
# Withdrawn petitions carry no decision, so they are removed entirely.
withdrawn_rows = df.index[df['CASE_STATUS'] == 'WITHDRAWN']
df = df.drop(index=withdrawn_rows)

# Fold every remaining outcome into one of the two target labels.
status_to_binary = {
    'CERTIFIED-WITHDRAWN': 'CERTIFIED',
    'REJECTED': 'DENIED',
    'INVALIDATED': 'DENIED',
    'PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED': 'DENIED',
}
df['CASE_STATUS'] = df['CASE_STATUS'].replace(status_to_binary)


In [None]:
# Missing occupation names are labelled 'others' so the string matching below never sees NaN.
df['SOC_NAME'] = df['SOC_NAME'].fillna('others')

In [None]:
import sys

# Collapse the detailed SOC_NAME values into a handful of broad occupation
# groups. Each rule is (regex alternatives, group label). Rules are applied in
# order, so when a name matches several rules the LAST matching rule wins.
soc_group_rules = [
    ('CHIEF|EXECUTIVES', 'Executives'),
    ('Computer|Software|Developer|Cloud|Cybersecurity|Application', 'IT'),
    ('Chief|Management|MANAGERS', 'Manager'),
    ('Mechanical|Automotive|Mechatronics', 'Mechanical'),
    ('Sales|Market', 'Sales & Market'),
    ('FINANCIAL|Capitalist|Banker', 'Finance'),
    ('Public|Fundraising', 'P.R'),
    ('education|law', 'Administrative'),
    ('Auditors|Compliance', 'Audit'),
    ('Recruiters|Human', 'H.R'),
    ('Agricultural|Farm|Horticultural|Cultivation', 'Agriculture'),
    ('Construction|Architectural', 'Estate'),
    # 'Forensic' was misspelled 'Forencsic', which could never match.
    ('Forensic|Health|Doctor|Medicine', 'Medical'),
    ('teachers|Professor', 'Education'),
]

# Anything that matches no rule stays in the catch-all group.
df['SOC_NAME_NEW'] = 'others'
for pattern, group_label in soc_group_rules:
    # A single .loc assignment writes into df itself. The chained form
    # df[col][mask] = value may assign to a temporary copy
    # (SettingWithCopyWarning) and has no effect under copy-on-write.
    df.loc[df['SOC_NAME'].str.contains(pattern), 'SOC_NAME_NEW'] = group_label

# NOTE(review): str.contains() is case-SENSITIVE by default (case=True), so each
# alternative only matches names with exactly this capitalisation. If
# case-insensitive grouping is intended, pass case=False above -- and re-check
# the rule order, since e.g. 'Chief' would then also override 'CHIEF'.

In [None]:
# Fill any remaining gaps in the categorical/discrete columns with the column's
# most frequent value. The result is assigned back to the column instead of
# calling fillna(inplace=True) on df[col]: the chained in-place form operates on
# an intermediate object, is deprecated in recent pandas, and does not update
# df under copy-on-write.
for column in ('CASE_STATUS', 'SOC_NAME', 'FULL_TIME_POSITION', 'YEAR'):
    df[column] = df[column].fillna(df[column].mode().iloc[0])

In [None]:
# Impute missing wages with the median wage. Assign the result back rather than
# using a chained fillna(inplace=True), which may act on a temporary object and
# leave df unchanged.
df['PREVAILING_WAGE'] = df['PREVAILING_WAGE'].fillna(df['PREVAILING_WAGE'].median())

# Sanity check: list the distinct values left in the cleaned columns.
print(df['CASE_STATUS'].unique())
print(df['YEAR'].unique())
print(df['FULL_TIME_POSITION'].unique())

In [None]:
# Re-count missing values per column after the imputation steps above.
df.isna().sum()

# Plotting/ Data Visualization

## Top 10 Applicants in 2016

In [None]:
import plotly.express as px

# Petitions filed in 2016 only.
df2016 = df[df['YEAR'] == 2016]

# The ten employers with the most petitions, as an (EMPLOYER_NAME, Freq) table.
top_10_emp = (
    df2016['EMPLOYER_NAME']
    .value_counts()
    .head(10)
    .rename_axis('EMPLOYER_NAME')
    .reset_index(name='Freq')
)
# Ascending order: smallest count first, largest last.
top_10_emp = top_10_emp.sort_values(by='Freq', ascending=True)

fig = px.bar(
    top_10_emp,
    x="Freq",
    y="EMPLOYER_NAME",
    orientation="h",
    title="Top 10 Applicants in 2016",
    labels={"Freq": "Frequency", "EMPLOYER_NAME": "Employer Name"},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    height=700,
)
fig.update_traces(marker_line_width=1)
fig.update_layout(showlegend=False)
fig.show()


## Top 10 Applicants in 2015

In [None]:
import plotly.express as px

# Petitions filed in 2015 only.
df2015 = df[df['YEAR'] == 2015]

# The ten employers with the most petitions, as an (EMPLOYER_NAME, Freq) table.
top_10_emp = (
    df2015['EMPLOYER_NAME']
    .value_counts()
    .head(10)
    .rename_axis('EMPLOYER_NAME')
    .reset_index(name='Freq')
)
# Ascending order: smallest count first, largest last.
top_10_emp = top_10_emp.sort_values(by='Freq', ascending=True)

fig = px.bar(
    top_10_emp,
    x="Freq",
    y="EMPLOYER_NAME",
    orientation="h",
    title="Top 10 Applicants in 2015",
    labels={"Freq": "Frequency", "EMPLOYER_NAME": "Employer Name"},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    height=700,
)
fig.update_traces(marker_line_width=1)
fig.update_layout(showlegend=False)
fig.show()


## Top 10 Applicants in 2014

In [None]:
import plotly.express as px

# Petitions filed in 2014 only.
df2014 = df[df['YEAR'] == 2014]

# The ten employers with the most petitions, as an (EMPLOYER_NAME, Freq) table.
top_10_emp = (
    df2014['EMPLOYER_NAME']
    .value_counts()
    .head(10)
    .rename_axis('EMPLOYER_NAME')
    .reset_index(name='Freq')
)
# Ascending order: smallest count first, largest last.
top_10_emp = top_10_emp.sort_values(by='Freq', ascending=True)

fig = px.bar(
    top_10_emp,
    x="Freq",
    y="EMPLOYER_NAME",
    orientation="h",
    title="Top 10 Applicants in 2014",
    labels={"Freq": "Frequency", "EMPLOYER_NAME": "Employer Name"},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    height=700,
)
fig.update_traces(marker_line_width=1)
fig.update_layout(showlegend=False)
fig.show()


In [None]:
# Names of the ten employers with the most petitions from 2015 onwards.
recent_employers = df.loc[df['YEAR'] >= 2015, 'EMPLOYER_NAME']
petitions_per_employer = recent_employers.groupby(df['EMPLOYER_NAME']).count()
top_emp = petitions_per_employer.sort_values(ascending=False).head(10).index.tolist()


In [None]:
# Display the selected employer names.
top_emp

In [None]:
# Rows belonging to the top employers, grouped by (employer, year) for the
# per-year plots below.
top_employer_rows = df.loc[
    df['EMPLOYER_NAME'].isin(top_emp),
    ['EMPLOYER_NAME', 'YEAR', 'PREVAILING_WAGE'],
]
byempyear = top_employer_rows.groupby([df['EMPLOYER_NAME'], df['YEAR']])


In [None]:
# Display the GroupBy object (no aggregation is computed here).
byempyear

# Number of Applications of Top 10 Applicants

In [None]:
import plotly.graph_objects as go

markers=['circle','square','diamond','cross','x','triangle-up','triangle-down','triangle-left','triangle-right','star']

# Petitions per (employer, year). The aggregation does not depend on the loop
# variable, so it is computed once here instead of once per company, and only
# for the PREVAILING_WAGE column that is actually plotted.
applications_per_year = byempyear['PREVAILING_WAGE'].count()

fig = go.Figure()

# One line per employer; each employer gets the marker at its own position.
for marker_symbol, company in zip(markers, top_emp):
    tmp = applications_per_year.loc[company]  # indexed by YEAR
    fig.add_trace(go.Scatter(x=tmp.index.values, y=tmp.values,
                              name=company, mode='lines+markers', marker=dict(symbol=marker_symbol, size=10),
                              line=dict(width=2)))

fig.update_layout(xaxis_title="Year", yaxis_title="Number of Applications",
                  title_text="Number of Applications of Top 10 Applicants")
fig.show()


# Average Salary of Top 10 Applicants

In [None]:
import plotly.graph_objects as go

markers=['circle','square','diamond','cross','x','triangle-up','triangle-down','triangle-left','triangle-right','star']

# Mean prevailing wage per (employer, year), computed once up front. The
# aggregation is restricted to PREVAILING_WAGE on purpose: the grouped frame
# also carries the text column EMPLOYER_NAME, and calling mean() on the whole
# group can raise TypeError for non-numeric columns in pandas >= 2.0.
mean_wage_per_year = byempyear['PREVAILING_WAGE'].mean()

fig = go.Figure()

# One line per employer; each employer gets the marker at its own position.
for marker_symbol, company in zip(markers, top_emp):
    tmp = mean_wage_per_year.loc[company]  # indexed by YEAR
    fig.add_trace(go.Scatter(x=tmp.index.values, y=tmp.values,
                              name=company, mode='lines+markers', marker=dict(symbol=marker_symbol, size=10),
                              line=dict(width=2)))

fig.update_layout(xaxis_title="Year", yaxis_title="Average Salary offered (USD)",
                  title_text="Average Salary of Top 10 Applicants")
fig.show()


## Number of Applications made for the Full Time Position

In [None]:
import plotly.express as px

# Count of petitions for each FULL_TIME_POSITION value.
fig = px.histogram(
    df,
    x='FULL_TIME_POSITION',
    title='NUMBER OF APPLICATIONS MADE FOR THE FULL TIME POSITION',
)
fig.update_xaxes(title='FULL TIME POSITION')
fig.update_yaxes(title='NUMBER OF PETITIONS MADE')

# Light fill with a black outline around every bar.
bar_style = dict(color='rgb(158,222,225)', line=dict(color='black', width=1.5))
fig.update_traces(marker=bar_style)
fig.show()


In [None]:
# Remove the saved index column and the columns that are not used as model features.
unused_columns = ['Unnamed: 0', 'EMPLOYER_NAME', 'JOB_TITLE', 'WORKSITE', 'lon', 'lat']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the remaining columns.
df.head(5)

In [None]:
# True for every column that still contains at least one missing value.
df.isnull().any()

In [None]:
# Class balance of the target after merging the outcomes.
df.CASE_STATUS.value_counts()

## Target variable values

In [None]:
import plotly.graph_objects as go

# Petitions per case status.
df_count = df['CASE_STATUS'].value_counts()

# Horizontal bars: status on the y-axis, count on the x-axis.
status_bars = go.Bar(
    x=df_count.values,
    y=df_count.index,
    orientation='h',
    marker=dict(color='rgb(158,222,225)', line=dict(color='black', width=1.5)),
)
fig = go.Figure(data=[status_bars])
fig.update_layout(
    title="Target variable values",
    xaxis_title="NUMBER OF PETITIONS MADE",
    yaxis_title="CASE STATUS",
)
fig.show()


## Number of applications made per Year

In [None]:
import plotly.graph_objects as go

# Petitions per year.
df_year_count = df['YEAR'].value_counts()

# Vertical bars: year on the x-axis, count on the y-axis.
year_bars = go.Bar(
    x=df_year_count.index,
    y=df_year_count.values,
    marker=dict(color='rgb(158,222,225)', line=dict(color='rgb(8,48,107)', width=1.5)),
)
fig = go.Figure(data=[year_bars])
fig.update_layout(
    title="NUMBER OF PETITIONS MADE PER YEAR",
    xaxis_title="YEAR",
    yaxis_title="NUMBER OF PETITIONS MADE",
)
fig.show()


In [None]:
# Numeric target column: 1 for CERTIFIED, 0 for DENIED.
status_codes = {'CERTIFIED': 1, 'DENIED': 0}
df['CASE_STATUS_tar'] = df['CASE_STATUS'].map(status_codes)


In [None]:
# Counts of the original (string) target labels.
df['CASE_STATUS'].value_counts()

In [None]:
# Encode the full-time flag numerically: 'Y' -> 1, 'N' -> 0.
full_time_codes = {'N': 0, 'Y': 1}
df['FULL_TIME_POSITION'] = df['FULL_TIME_POSITION'].map(full_time_codes)
df.head()

In [None]:
# Occupation groups that actually occur in the data.
df['SOC_NAME_NEW'].unique()

In [None]:
from sklearn import preprocessing

# Encode the categorical SOC_NAME_NEW groups as integer labels in a new SOC_N
# column; `le` stays fitted so the mapping can be inspected or inverted later.
le = preprocessing.LabelEncoder()
df['SOC_N'] = le.fit_transform(df['SOC_NAME_NEW'])

In [None]:
# Show which integer code the label encoder assigned to each occupation group.
# SOC_N is selected before aggregating, which avoids DataFrameGroupBy.apply:
# applying a function that also receives the grouping column is deprecated in
# recent pandas and emits a warning.
group = df.groupby('SOC_NAME_NEW')
df2 = group['SOC_N'].unique()
df2

In [None]:
# Drop the text columns that now have numeric counterparts, then let the
# numeric target take over the CASE_STATUS name.
df = (
    df.drop(columns=['SOC_NAME', 'SOC_NAME_NEW', 'CASE_STATUS'])
      .rename(columns={'CASE_STATUS_tar': 'CASE_STATUS'})
)

In [None]:
# Pairwise correlation between the remaining columns.
df.corr()

In [None]:
# Annotated heatmap of the correlation matrix.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap="RdBu", annot=True, annot_kws={"size": 9}, linewidth=1)

In [None]:
# Columns used as model inputs.
featurecols =['FULL_TIME_POSITION','PREVAILING_WAGE','YEAR','SOC_N']

## Applying Logistic Regression

In [None]:
# Feature matrix and single-column target frame.
X = df.reindex(columns=featurecols)
y = df.reindex(columns=["CASE_STATUS"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Flatten the single-column target into the 1-D array the estimator expects.
y_train = np.asarray(y_train).ravel()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline: logistic regression with default settings on the raw features.
LogReg = LogisticRegression().fit(x_train, y_train)
y_pred = LogReg.predict(x_test)

In [None]:
# Mean accuracy of the baseline model on the held-out rows.
LogReg.score(x_test,y_test)

## Applying Z score method to remove outliers

In [None]:
import pandas as pd
from scipy import stats

# Model inputs and target, rebuilt from the cleaned frame.
featurecols = ["FULL_TIME_POSITION", "PREVAILING_WAGE", "YEAR", "SOC_N"]
X = df.reindex(columns=featurecols)
y = df.reindex(columns=["CASE_STATUS"])

# A value counts as an outlier when it lies more than z_thresh standard
# deviations from its column mean.
z_thresh = 3
z_scores = stats.zscore(X)

# Despite its name, the mask is True for rows to KEEP: every feature of the
# row must be within the threshold.
within_threshold = abs(z_scores) <= z_thresh
outlier_mask = within_threshold.all(axis=1)

# Features and target restricted to the rows that passed the filter.
X_no_outliers = X.loc[outlier_mask]
y_no_outliers = y.loc[outlier_mask]




In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split as before, now on the outlier-filtered rows.
x_train, x_test, y_train, y_test = train_test_split(
    X_no_outliers,
    y_no_outliers,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Flatten the single-column target into a 1-D array.
y_train = np.asarray(y_train).ravel()

In [None]:
# Confirm the target is now one-dimensional.
y_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Refit the default logistic regression on the outlier-filtered training rows.
LogReg = LogisticRegression().fit(x_train, y_train)
y_pred = LogReg.predict(x_test)

In [None]:
# Mean accuracy on the held-out rows of the filtered data.
LogReg.score(x_test,y_test)

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

In [None]:

# 75/25 split of the PCA-projected features.
x_pca_train, x_pca_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression on the principal components; the target frame is
# flattened to 1-D for fitting.
pca_target = y_train.values.ravel()
LogReg = LogisticRegression()
LogReg.fit(x_pca_train, pca_target)
y_pred = LogReg.predict(x_pca_test)


In [None]:
# Mean accuracy of the PCA-based model on its held-out rows.
LogReg.score(x_pca_test,y_test)

In [None]:
# Number of components PCA actually kept.
pca.n_components_

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the most recent predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix` would
# replace the imported function with an array, so any later call would fail
# unless the function is imported again first.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Import required libraries
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# NOTE(review): the cells that follow split df_upsampled / df_downsampled into
# train and test sets AFTER resampling, so copies of the same row can land on
# both sides of the split and inflate test scores. Consider splitting first and
# resampling only the training portion.

# Separate the two classes (the names assume class 1 is the larger one).
df_majority = df[df.CASE_STATUS==1]
df_minority = df[df.CASE_STATUS==0]

# Synthetic oversampling with SMOTE. X_smote / y_smote are not used by any
# later cell in this notebook.
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X, y)

# Upsample minority class using random oversampling
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Downsample majority class. Sampling WITHOUT replacement keeps every selected
# majority row distinct; replace=True here would draw duplicates while
# discarding other rows.
df_majority_downsampled = resample(df_majority,
                                   replace=False,   # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])


## After applying Upsampling

In [None]:
# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a default logistic regression and predict the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Share of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix` would
# replace the imported function with an array, so any later call would fail
# unless the function is imported again first.
cm = confusion_matrix(y_test, y_pred)
cm

## Applying downsampling

In [None]:
# Target column and feature matrix taken from the downsampled data.
y = df_downsampled['CASE_STATUS']
X = df_downsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a default logistic regression and predict the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Share of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix` would
# replace the imported function with an array, so any later call would fail
# unless the function is imported again first.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
from sklearn.metrics import confusion_matrix
# NOTE: this rebinds `confusion_matrix` from the imported function to the
# resulting array; the heatmap cell that follows reads the array under this
# name, so the function must be re-imported before it can be called again.
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the confusion matrix computed in the previous cell (at this point
# `confusion_matrix` holds the array, not the sklearn function).
plt.figure(figsize=(10,7))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts to scientific notation.
sns.heatmap(confusion_matrix, annot=True, fmt='d')
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a default logistic regression and predict the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Overall accuracy followed by the per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

## Decision Tree Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Decision tree with default hyper-parameters.
model_dt = DecisionTreeClassifier().fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_dt)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)
print(classification_report(y_test, y_pred_dt))

## Gradient Boosting Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gradient boosting with default hyper-parameters.
model_gb = GradientBoostingClassifier().fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_gb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)

print(classification_report(y_test, y_pred_gb))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with default hyper-parameters.
model_rf = RandomForestClassifier().fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)

print(classification_report(y_test, y_pred_rf))


## KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)
print(classification_report(y_test, y_pred_knn))

## Gaussian NB 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gaussian naive Bayes with default settings.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)

print(classification_report(y_test, y_pred_nb))

## AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# AdaBoost ensemble of 100 estimators, seeded for reproducibility.
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_ada)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)

print(classification_report(y_test, y_pred_ada))

## XG Boost Classifier

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target column and feature matrix taken from the upsampled data.
y = df_upsampled['CASE_STATUS']
X = df_upsampled.drop(columns='CASE_STATUS')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with default hyper-parameters.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Score the held-out predictions with each metric in turn.
acc, prec, recall, f1 = (
    metric(y_test, y_pred_xgb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("f1_score : ", f1),
    ("Accuracy:", acc),
):
    print(label, value)

print(classification_report(y_test, y_pred_xgb))

# Stacking

### The key idea behind Stacking is to leverage the strengths of each individual model and to reduce their weaknesses by combining their predictions. This can lead to a more accurate and robust predictive model than any single model on its own.

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Base learners, each registered under a short name. This cell reuses the
# X_train / X_test / y_train / y_test split left by the preceding cell.
base_models = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
]

# Final estimator that combines the base learners' outputs.
meta_model = LogisticRegression()

# Assemble and fit the stacked ensemble.
stacking = StackingClassifier(final_estimator=meta_model, estimators=base_models)
stacking.fit(X_train, y_train)

# Predict the held-out rows and report the share predicted correctly.
y_pred = stacking.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
