## PROJECT: BUILDING A PREDICTIVE MODEL FOR THE EASY VISA DATASET

## Phase 1 : Data Collection and Preparation
Task 1.1: Load the dataset into a pandas DataFrame

DATA INGESTION

In [None]:
# Importing of necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder

: 

In [None]:
# Remote location of the raw EasyVisa CSV file
url = (
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets"
    "/refs/heads/main/EasyVisa%20(1).csv"
)

In [None]:
# Read the CSV behind `url` into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer=url)
data.head()

PRELIMINARY DATA ANALYSIS

In [None]:
# Inspect the dimensions of the dataset as (rows, columns)
data.shape
# Observed output: the dataset has 25480 rows and 12 columns

In [None]:
# Count the missing values in each column
data.isnull().sum()

# The output indicates that there are no missing values in the dataset

In [None]:
data.columns
# This shows the names of all the columns in the dataset

In [None]:
data.duplicated().sum()
# The output indicates that the dataset has no duplicate rows
# NOTE(review): this check runs while `case_id` is still present; if that column is a
# unique identifier it would mask otherwise-identical rows - confirm after it is dropped.

In [None]:
# Remove the `case_id` column in place, then preview the remaining columns
data.drop(columns=["case_id"], inplace=True)
data.head()

## Phase 2: EXPLORATORY DATA ANALYSIS (EDA)

Descriptive Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns,
# rounded to two decimals. The rows are kept separate: adding them together would mix
# unrelated statistics into a single meaningless number per column.
data.describe().round(2)

In [None]:
data.info()
# data.info() reports 25480 rows.
# NOTE(review): the original note listed 9 categorical, 2 int64 and 1 float64 columns (12 in
# total), which matches the 12 columns seen before `case_id` was dropped; with that column
# removed the categorical count is presumably 8 - confirm against the printed output.

# Exploratory Data Analysis

Understanding the dataset and checking the unique values of each categorical column

In [None]:
# Collect the names of the categorical (object-dtype) columns
cat_col = data.select_dtypes("object").columns.tolist()

# Print the frequency of every level, one column at a time, followed by a divider
for col in cat_col:
    print(data[col].value_counts())
    print("--" * 50)

## Univariate Analysis

In [None]:
def histogram_boxplot(data, feature, figsize=(12, 8), kde=False, bins=None):
    """
    Draw a boxplot above a histogram of one column, with a shared x-axis.

    The mean is marked on the boxplot, and the mean (green dashed line) and
    median (black solid line) are marked on the histogram.

    data: dataframe
    feature: name of the dataframe column to plot
    figsize: size of figure (default (12, 8))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, which lets seaborn
        choose the bins with its "auto" rule)
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # both panels use the same x-axis
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # showmeans=True adds a marker for the mean to the boxplot
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")
    # A falsy `bins` (None or 0) falls back to seaborn's default binning
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist, bins=bins if bins else "auto"
    )
    values = data[feature]
    ax_hist.axvline(values.mean(), color="green", linestyle="--")  # mean
    ax_hist.axvline(values.median(), color="black", linestyle="-")  # median
    plt.tight_layout()
    plt.show()

In [None]:
# Preview the first five rows of the dataset
data.head(5)

In [None]:
# Create a copy for preprocessing
df_processed = data.copy()

# 1. Check for missing values (EDA showed no missing values)
print("\n1. Missing Values:")
missing_values = data.isnull().sum()
if missing_values.sum() > 0:
    # Only list the columns that actually contain missing values
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

# 2. Check for duplicates
print("\n2. Duplicate Rows:")
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(data))*100:.2f}%")

# 3. Check skewness for variables identified in EDA as right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
skewed_vars = ['no_of_employees', 'prevailing_wage', 'yr_of_estab']
for var in skewed_vars:
    if var in data.columns:
        skewness = data[var].skew()
        # Classify by sign as well as magnitude, so that a strongly negative
        # skew is reported as left-skewed instead of being treated as unskewed.
        if skewness > 0.5:
            skew_label = 'right-skewed'
        elif skewness < -0.5:
            skew_label = 'left-skewed'
        else:
            skew_label = 'approximately symmetric'
        print(f"{var}: skewness = {skewness:.3f} ({skew_label})")

In [None]:
# Names of the numeric columns (displayed as the cell output)
num_features = data.select_dtypes(include="number").columns
num_features

In [None]:
# Draw the combined boxplot/histogram for every numeric column
for col in num_features:
    # "\h" is not a valid escape sequence; print a blank line and a readable label instead
    print(f"\nhistogram_boxplot: {col}")
    histogram_boxplot(data, col, kde=True, bins=20)



# Identify and handle outliers in the dataset.
# Findings from the univariate plots above: most numeric features are skewed and contain possible outliers.


In [None]:
# Names of the categorical (object-dtype) columns (displayed as the cell output)
cat_features = data.select_dtypes(include="object").columns
cat_features

In [None]:
# function to create labeled barplots


def labeled_barplot(data, feature, perc=False, n=None):
    """
    Draw a count plot of one column with a label above every bar.

    Bars are ordered from the most to the least frequent level.

    data: dataframe
    feature: name of the dataframe column to plot
    perc: whether to label bars with percentages instead of counts (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # denominator for the percentage labels
    # Figure width grows with the number of bars that will be drawn
    n_bars = data[feature].nunique() if n is None else n
    plt.figure(figsize=(n_bars + 2, 6))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n],
    )

    for p in ax.patches:
        height = p.get_height()
        if perc:
            # share of all rows that fall in this level
            label = "{:.1f}%".format(100 * height / total)
        else:
            # bar heights are floats; format the count without a trailing ".0"
            label = "{:.0f}".format(height)

        ax.annotate(
            label,
            (p.get_x() + p.get_width() / 2, height),  # top centre of the bar
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the label 5 points above the bar
            textcoords="offset points",
        )

    plt.show()

In [None]:
# Count plot of applicants per continent (counts shown, all levels)
labeled_barplot(data, feature="continent")
    

    

    

In [None]:
# Preview the first rows of the dataset
data.head()

In [None]:
# Count plot for each remaining categorical column, drawn in the order listed
for feature_name in (
    "education_of_employee",
    "has_job_experience",
    "requires_job_training",
    "region_of_employment",
    "unit_of_wage",
    "case_status",
    "full_time_position",
):
    labeled_barplot(data, feature_name, perc=False, n=None)




    
    

## Bivariate Analysis

Numerical_col

In [None]:
# Correlation heatmap of the numeric columns
col_list = data.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(12, 8))
# Correlation coefficients lie in [-1, 1], so pin the colour scale to that range
# (vmin=1 would put the bottom of the scale at the largest possible value).
# A diverging colormap keeps positive and negative correlations distinguishable.
sns.heatmap(
    data[col_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="coolwarm"
)
plt.show()

In [None]:
### function to plot distributions wrt target


def distribution_plot_wrt_target(data, predictor, target):
    """
    Plot `predictor` against the first two distinct values of `target` on a 2x2 grid.

    Top row: density histogram (with KDE) of `predictor` for each of the two values.
    Bottom row: boxplots of `predictor` per `target` value, with and without fliers.

    data: dataframe
    predictor: name of the column whose distribution is plotted
    target: name of the column that splits the data
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))

    classes = data[target].unique()

    # Top row: one histogram per target value, left to right
    for position, shade in enumerate(("teal", "orange")):
        panel = axs[0, position]
        level = classes[position]
        panel.set_title("Distribution of target for target=" + str(level))
        sns.histplot(
            data=data[data[target] == level],
            x=predictor,
            kde=True,
            ax=panel,
            color=shade,
            stat="density",
        )

    # Bottom row: the same boxplot twice, the second one hiding the fliers
    box_panels = (
        (axs[1, 0], "Boxplot w.r.t target", {}),
        (axs[1, 1], "Boxplot (without outliers) w.r.t target", {"showfliers": False}),
    )
    for panel, heading, extra in box_panels:
        panel.set_title(heading)
        sns.boxplot(
            data=data, x=target, y=predictor, ax=panel, palette="gist_rainbow", **extra
        )

    plt.tight_layout()
    plt.show()

In [None]:
# Plot every numeric column against the visa decision
for col in num_features:
    # "\d" is not a valid escape sequence; print a blank line and a readable label instead
    print(f"\ndistribution_plot_wrt_target: {col}")
    distribution_plot_wrt_target(data, col, target="case_status")

Categorical_Feature

In [None]:
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    The printed table holds raw counts with an "All" margin; the chart shows the
    share of each target value within every predictor level.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # value_counts() sorts in descending order, so the last label is the
    # least frequent target value; both tables are ordered by that column.
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # One legend only, anchored outside the axes so it does not cover the bars
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

In [None]:
# Plot every categorical column against the visa decision
for col in cat_features:
    # Cross-tabulating the target with itself carries no information
    if col == "case_status":
        continue
    # "\s" is not a valid escape sequence; print a blank line and a readable label instead
    print(f"\nstacked_barplot: {col}")
    stacked_barplot(data, col, target="case_status")

In [None]:
# Print the skewness of every numeric column
num_features = data.select_dtypes(include=[np.number]).columns
for col in num_features:
    # The stray double quote that used to follow the number has been removed
    print(f'{col} = {data[col].skew():.3f}  skewness value')

In [None]:
def bivariate_num_cat(data1, feature, target='case_status'):
    """
    Plots and summarizes relationship between a numerical feature and a categorical target.

    Prints the mean, std, min and max of `feature` for every `target` value,
    then draws a boxplot of `feature` per `target` value.

    data1: dataframe
    feature: name of the numerical column
    target: name of the categorical column to group by (default 'case_status')
    """
    print(f" Feature: {feature} vs {target}")
    print("="*50)

    # Per-group summary, reduced to the four statistics of interest
    summary = data1.groupby(target)[feature].describe()[['mean','std','min','max']]
    print(summary)
    print()

    plt.figure(figsize=(10,5))

    # Boxplot of the feature for each target value
    sns.boxplot(x=target, y=feature, data=data1, palette='Set2')
    plt.title(f'{feature} across {target} categories', fontsize=13)
    plt.xlabel(target)
    plt.ylabel(feature)
    plt.show()

In [None]:
# for col in num_features:
#     for col2 in cat_features:
#         if col2 =="case_Status":
            # bivariate_num_cat(data, col, col2)

In [None]:
# Year of establishment of the employer, split by continent
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x="continent", y="yr_of_estab", ax=ax)
plt.show()

In [None]:
# Horizontal boxplots of the number of employees, one per visa decision
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data, x="no_of_employees", y="case_status", ax=ax)
plt.show()

In [None]:
# Create a copy for preprocessing
df_processed = data.copy()

# 1. Check for missing values (EDA showed no missing values)
print("\n1. Missing Values:")
missing_values = data.isnull().sum()
if missing_values.sum() > 0:
    # Only list the columns that actually contain missing values
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

# 2. Check for duplicates
print("\n2. Duplicate Rows:")
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(data))*100:.2f}%")

# 3. Check skewness for variables identified in EDA as right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
skewed_vars = ['no_of_employees', 'prevailing_wage', 'yr_of_estab']
for var in skewed_vars:
    if var in data.columns:
        skewness = data[var].skew()
        # Classify by sign as well as magnitude, so that a strongly negative
        # skew is reported as left-skewed instead of being treated as unskewed.
        if skewness > 0.5:
            skew_label = 'right-skewed'
        elif skewness < -0.5:
            skew_label = 'left-skewed'
        else:
            skew_label = 'approximately symmetric'
        print(f"{var}: skewness = {skewness:.3f} ({skew_label})")



In [None]:



def temp_encode_for_correlation_check(df, target_col, figsize=(10,2)):
    """
    Plots a one-row heatmap showing the correlation of every column with target_col.

    Object and category columns are label-encoded on a temporary copy first, so
    `df` itself is not modified. The integer codes are only a stand-in for the
    category labels, so correlations involving encoded columns are a rough
    screening aid rather than a measure of linear association.

    df: dataframe
    target_col: name of the column to correlate every other column with
    figsize: size of figure (default (10, 2))

    Returns a Series of correlations (sorted by absolute magnitude).
    """
    # Work on a copy so the caller's dataframe keeps its original values
    df_temp = df.copy()
    # Replace every categorical column by integer codes
    for col in df_temp.select_dtypes(include=["object", "category"]).columns:
        df_temp[col] = LabelEncoder().fit_transform(df_temp[col].astype(str))
    corrs = df_temp.corrwith(df_temp[target_col]).drop(target_col)
    corrs_df = corrs.to_frame(name='corr').T  # shape (1, n)
    plt.figure(figsize=figsize)
    sns.heatmap(
        corrs_df,
        annot=True,
        fmt=".3f",
        cmap="coolwarm",
        center=0,
        vmin=-1,
        vmax=1,
        cbar_kws={'orientation': 'vertical', 'shrink':0.7}
    )
    plt.xticks(rotation=45, ha='right')
    plt.yticks([0], [target_col], rotation=0)
    # Encoded categorical columns are included, so the title no longer says "numeric"
    plt.title(f'Correlation of features with {target_col}', fontsize=12)
    plt.tight_layout()
    plt.show()
    # return sorted correlations for downstream use
    return corrs.reindex(corrs.abs().sort_values(ascending=False).index)

In [None]:
# Correlation of every (label-encoded) column with the visa decision
temp_encode_for_correlation_check(data, target_col="case_status", figsize=(10, 3))

In [None]:
def correlation_with_target(data, target_col, figsize=(10,2)):
    """
    Plots a heatmap showing correlation of each numeric feature against target_col.

    data: dataframe
    target_col: name of a numeric column of `data`
    figsize: size of figure (default (10, 2))

    Returns a series of correlations (sorted by absolute magnitude).
    Raises ValueError if target_col is not a numeric column of `data`.
    """
    # Keep only numeric columns; correlations are undefined for the others
    numeric = data.select_dtypes(include="number")
    if target_col not in numeric.columns:
        raise ValueError(
            f"target_col {target_col!r} must be a numeric column of the dataframe"
        )

    # compute correlations of every numeric column with the target
    corrs = numeric.corrwith(numeric[target_col]).drop(target_col)

    corrs_df = corrs.to_frame(name='corr').T  # shape (1, n)

    plt.figure(figsize=figsize)
    sns.heatmap(
        corrs_df,
        annot=True,
        fmt=".3f",
        cmap="coolwarm",
        center=0,
        vmin=-1,
        vmax=1,
        cbar_kws={'orientation': 'vertical', 'shrink':0.7}
    )
    plt.xticks(rotation=45, ha='right')
    plt.yticks([0], [target_col], rotation=0)
    plt.title(f'Correlation of numeric features with {target_col}', fontsize=12)
    plt.tight_layout()
    plt.show()

    # return sorted correlations for downstream use
    return corrs.reindex(corrs.abs().sort_values(ascending=False).index)


In [None]:
# Write the dataframe (from which `case_id` was dropped earlier) to a CSV file, omitting the row index
data.to_csv("cleaned_Visa_dataset.csv", index=False)