In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

In [None]:
# !pip install --upgrade jupyterlab ipywidgets


In [None]:
# !jupyter lab build


In [None]:
df = pd.read_csv('data.csv', encoding='latin1')
# Keep an untouched snapshot of the raw data: the 'Investors' feature engineering
# further down works on this copy instead of the cleaned `df`.
df2=df.copy()

In [None]:
df.info()
# The data is high-dimensional, so the helpers defined in the next cell are used instead:
# custom_info() lists every column and summary_statistics() extends describe().

In [None]:
def custom_info(df):
    """Print the frame's shape followed by one line per column.

    Each line shows the column position, the column name (left-aligned and
    padded to 50 characters), the number of non-null entries and the dtype.
    """
    print(f"DataFrame shape: {df.shape}")
    row_total = len(df)
    for position, name in enumerate(df.columns):
        series = df[name]
        present = row_total - series.isna().sum()
        print(f" {position}    {name:<50} {present} non-null    {series.dtype}")
    
def summary_statistics(df):
    """Print describe() for the frame, then a per-column report for object columns.

    For every object-dtype column the report shows the column name, the number
    of distinct values and the full value_counts() table.
    """
    print("Summary statistics for numeric columns:")
    print(df.describe())

    print("\nUnique value counts for object (categorical) columns:")
    object_frame = df.select_dtypes(include=['object'])
    for name, series in object_frame.items():
        print(f"\nColumn: {name}")
        print(f"{series.nunique()} unique values")
        print(series.value_counts())

def find_weak_columns(df, threshold=0.5):
    """
    List the columns whose share of missing values is strictly above `threshold`.

    Parameters:
    df (DataFrame): The DataFrame to analyze.
    threshold (float): Maximum tolerated proportion of missing values (default 0.5).

    Returns:
    list: Column names, in the frame's column order, whose missing ratio exceeds the threshold.
    """
    # The mean of the boolean isna() mask is the per-column missing ratio.
    missing_ratio = df.isna().mean()
    return missing_ratio[missing_ratio > threshold].index.tolist()

def count_nan_values(df, features):
    """Return {feature: number of missing values}, ordered from most to fewest missing.

    Only the columns named in `features` are inspected; ties keep the order in
    which the features were passed.
    """
    missing_per_feature = {name: df[name].isna().sum() for name in features}
    ranked = sorted(missing_per_feature.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def droping(df):
    """Drop, in place, the columns the analysis decided to discard.

    Two passes are made over `df`:
    1. a fixed list of columns is removed;
    2. every column that find_weak_columns() reports (more than 50% missing
       with its default threshold) is removed.

    The frame is modified in place and nothing is returned. Columns from the
    fixed list that are already absent are skipped, so the cell can be re-run
    without raising KeyError.
    """
    discarded_columns = [
        'Est. Founding Date',
        'Company_Name',
        'Short Description of company profile',
        'Specialization of highest education',
        'Investors',
        'Industry of company',
        'Employees per year of company existence',
        'Last round of funding received (in milionUSD)',
        'Time to 1st investment (in months)',
        'Experience in Fortune 100 organizations',
        'Experience in Fortune 1000 organizations',
        'Last Funding Date',
    ]
    # errors='ignore' makes the in-place drop idempotent across cell re-runs.
    df.drop(columns=discarded_columns, inplace=True, errors='ignore')
    df.drop(columns=find_weak_columns(df), inplace=True)

In [None]:
custom_info(df)

# Many columns have a high share of missing values, so the droping() function removes every column with more than 50% missing values.

In [None]:
df.head()

1. **Besides NaN, some columns use the placeholder string "No Info" for missing values; we will replace it with NaN.**
2. **Visually there are many more numeric columns than `custom_info` reported, so we list those columns in a variable and convert them to numeric dtypes.**
3. **We will also remove 'Est. Founding Date': keeping it alongside 'year of founding' would cause leakage. We drop this one specifically because it has more missing values.**

In [None]:
# Treat the "No Info" placeholder as a missing value everywhere
df.replace("No Info", np.nan, inplace=True)

# Columns that look numeric when the data is inspected by eye
numeric_visual = [
    'Age of company in years',
    'Internet Activity Score',
    'Employee Count',
    'year of founding',
    'Employees count MoM change',
    'Last Funding Amount',
    'Number of Investors in Seed',
    'Number of Investors in Angel and or VC',
    'Number of Co-founders',
    'Number of of advisors',
    'Team size Senior leadership',
    'Team size all employees',
    'Number of of repeat investors',
    'Years of education',
    'Renowned in professional circle',
    'Number of Recognitions for Founders and Co-founders',
    'Skills score',
    'google page rank of company website',
    'Industry trend in investing',
    'Number of Direct competitors',
    'Employees per year of company existence',
    'Last round of funding received (in milionUSD)',
    'Time to 1st investment (in months)',
    'Avg time to investment - average across all rounds, measured from previous investment',
    'Percent_skill_Entrepreneurship',
    'Percent_skill_Operations',
    'Percent_skill_Engineering',
    'Percent_skill_Marketing',
    'Percent_skill_Leadership',
    'Percent_skill_Data Science',
    'Percent_skill_Business Strategy',
    'Percent_skill_Product Management',
    'Percent_skill_Sales',
    'Percent_skill_Domain',
    'Percent_skill_Law',
    'Percent_skill_Consulting',
    'Percent_skill_Finance',
    'Percent_skill_Investment',
    'Renown score',
    'Experience in Fortune 500 organizations',
    'Experience in Fortune 100 organizations',
    'Experience in Fortune 1000 organizations',
]

# Columns pandas already parsed as numeric (the ones that really are numeric right now)
numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

# Everything that should be numeric but is not yet: coerce it, unparseable entries become NaN
extracted_columns = [column for column in numeric_visual if column not in numeric_columns]
for col in extracted_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
summary_statistics(df)

Looking at the unique values of the string columns in the `summary_statistics()` output, the same value sometimes appears with different capitalization, so we normalize the casing by capitalizing every string column.

In [None]:
# Normalise casing in every object column: first letter upper-case, the rest lower-case
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    df[name] = df[name].str.capitalize()

In [None]:
custom_info(df)

In [None]:
summary_statistics(df)

# Now we will investigate High Cardinality issues

In [None]:
def vis_un_val_cat_var(df):
    """
    Draw a bar chart of the number of distinct values in each object-dtype
    column, ordered from the highest count to the lowest.
    """
    # Distinct-value count per object column, largest first
    cardinality = df.select_dtypes(include=['object']).nunique().sort_values(ascending=False)

    # One bar per column on a dedicated 10x6 figure
    fig, ax = plt.subplots(figsize=(10, 6))
    cardinality.plot(kind='bar', ax=ax)
    ax.set_title('Number of Unique Values for Categorical Variables')
    ax.set_xlabel('Categorical Variables')
    ax.set_ylabel('Number of Unique Values')
    plt.show()
vis_un_val_cat_var(df)

In [None]:
# Rank the object columns by their number of distinct values, highest first.
categorical_vars = df.select_dtypes(include=['object'])
# NOTE(review): despite the "first_7" name, the slice [:8] keeps the top EIGHT columns —
# confirm which count is intended. The name is reused by a later cell.
first_7_unique_value_counts = categorical_vars.nunique().sort_values(ascending=False)[:8]
print(first_7_unique_value_counts)

In [None]:
count_nan_values(df,first_7_unique_value_counts.index.to_list())

# Decisions:

1. **Drop 'Company_Name'** due to very high cardinality.

2. From the columns **'Short Description of company profile'**, **'Industry of company'**, and **'Focus functions of company'**, we will drop the first two because they contain more missing values.

3. **Remove 'Specialization of highest education'** because the information is too scattered and uses many different delimiters, making further examination of the column almost impossible.

4. **For the remaining columns,** we will proceed with analysis as outlined in the cells below.


# This code below works with a DataFrame `df2` and performs several tasks related to an 'Investors' column

1. **Fill Missing Values:** Replaces missing values in the 'Investors' column with the most frequent value (mode).

2. **Get Unique Investors:** Finds all unique investors in the 'Investors' column by splitting values on the '|' character.

3. **Count Investor Occurrences:** Counts how often each unique investor appears in the 'Investors' column and prints the top 5 investors by count.

4. **Create 'Yes' or 'No' Columns:** For a list of target investors, the code creates new columns in `df2` with values 'Yes' if the investor is in the 'Investors' column for a given row, and 'No' otherwise.

5. **Copy Columns to Original DataFrame:** Copies the 'Yes'/'No' columns from `df2` to the original DataFrame `df`.

6. **Show Updated DataFrame:** Displays the first few rows of the updated DataFrame.

**We work on a copy of the DataFrame (`df2`) because `df` itself was changed by the capitalization step.**


In [None]:
# df2 is the raw snapshot, copied before "No Info" was mapped to NaN in df, so treat that
# placeholder as missing here as well, then impute with the most frequent entry (mode).
# The result is assigned back: fillna(inplace=True) on df2['Investors'] acts on a temporary
# Series under pandas copy-on-write and would leave df2 unchanged.
investors_raw = df2['Investors'].replace("No Info", np.nan)
df2['Investors'] = investors_raw.fillna(investors_raw.mode()[0])

# One set of stripped investor names per row ('|' is the separator); empty fragments are dropped
investors_list = df2['Investors'].tolist()
investor_sets = [
    {name.strip() for name in entry.split('|') if name.strip()}
    for entry in investors_list
]
unique_investors = set().union(*investor_sets)

# Print the count of unique investors
print(f"Number of unique investors: {len(unique_investors)}")

# Number of rows that list each investor. Exact names are counted, not substrings, so a
# short name is no longer credited with every longer name that happens to contain it.
investor_count = {}
for names in investor_sets:
    for name in names:
        investor_count[name] = investor_count.get(name, 0) + 1

# Sort by count in descending order and print the top 5 investors
sorted_investor_count = sorted(investor_count.items(), key=lambda x: x[1], reverse=True)
print("Top 5 investors by count:")
print(sorted_investor_count[:5])

# List of investor names for which we want to create 'Yes' or 'No' columns
target_investors = ['TechStars', 'SV Angel', '500 Startups', 'Sequoia Capital', 'Y Combinator']

# 'Yes' when the target name occurs in the row's 'Investors' text, 'No' otherwise.
# This is a literal substring match (regex=False), the same rule the row-by-row loop applied.
for investor in target_investors:
    mentioned = df2['Investors'].str.contains(investor, regex=False, na=False)
    df2[investor] = np.where(mentioned, 'Yes', 'No')

# Copy the new columns to df
df[target_investors] = df2[target_investors]

# Display the first few rows of the updated DataFrame
df.head()


In [None]:
summary_statistics(df)

In [None]:
focus_column = 'Focus functions of company'

# Fill missing values with the most frequent value (mode). The result is assigned back to the
# frame: fillna(inplace=True) on df[focus_column] acts on a temporary Series under pandas
# copy-on-write and would leave df unchanged.
df[focus_column] = df[focus_column].fillna(df[focus_column].mode()[0])

# Calculate value counts for the column
focus_function_counts = df[focus_column].value_counts()

# Map every value that occurs at most once to the catch-all label 'other'
replace_dict = {function: 'other' for function, count in focus_function_counts.items() if count <= 1}
df[focus_column] = df[focus_column].replace(replace_dict)

# Print the updated value counts
print(df[focus_column].value_counts())


In [None]:
# Split the 'Last Funding Date' column on '/' into separate pieces (one column per piece)
# NOTE(review): pieces 0 and 2 are used as month and year, which assumes an M/D/YYYY layout — confirm.
temp_list = df['Last Funding Date'].str.split('/', expand=True)
# Create new columns with the last funding month and the last funding year.
# Both stay as strings because they come straight from str.split.
df['Last_Funding_month'] = temp_list[0]
df['Last_Funding_year'] = temp_list[2]
df.head()

# Now we will investigate Multicollinearity issues

In [None]:
corr_matrix = df.select_dtypes(include=['int', 'float']).corr()

# Keep only the upper triangle (excluding the diagonal) so each pair appears once
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# One entry per column pair; the masked lower triangle and diagonal are NaN and are dropped
pairwise_corr = upper_triangle.unstack().dropna()

# Rank pairs by the absolute value of the coefficient: for multicollinearity a strong
# negative correlation matters as much as a strong positive one. Signed values are kept.
sorted_corr = pairwise_corr.reindex(pairwise_corr.abs().sort_values(ascending=False).index)

# Select the 15 strongest correlations
top_15_corr = sorted_corr.head(15)

print("Top 15 largest correlation coefficients:")
print(top_15_corr)



In [None]:
# Every column that takes part in at least one of the top correlated pairs.
# Sorted so the printed list (and the tie order in the NaN report) is the same on every run;
# plain set iteration order for strings changes between kernel sessions.
pair_index = top_15_corr.index
columns_in_top_15_corr = sorted(
    set(pair_index.get_level_values(0)) | set(pair_index.get_level_values(1))
)
print(columns_in_top_15_corr)
count_nan_values(df,columns_in_top_15_corr)

# Here we decided on the following actions:

1. **Remove 'Employees per year of company existence' and 'Last round of funding received (in milionUSD)' due to a large number of missing values.**

2. **Remove 'Time to 1st investment' because the difference in missing values between 'Avg time to investment' and 'Time to 1st investment' is small, and 'Avg time to investment' provides more comprehensive information.**

3. **Remove 'Experience in Fortune 100 organizations' and 'Experience in Fortune 1000 organizations' and leave 'Experience in Fortune 500 organizations'.**

4. **Keep all remaining columns because any correlation observed is likely random.**

**Here below we will visualize all numerical variables using an interactive dashboard with two strategies such as boxplot and histogram**

In [None]:
# Numeric columns shown by the dashboard
df_N = df.select_dtypes(include=['int', 'float'])
# Four plots per page, rounded up. The previous `// 4 + 1` produced an extra, empty
# page whenever the column count was an exact multiple of four.
num_pages = max(1, (len(df_N.columns) + 3) // 4)
strategy_dropdown = widgets.Dropdown(options=['Histogram', 'Box Plot'], description='Strategy:')
page_dropdown = widgets.Dropdown(options=list(range(1, num_pages + 1)), description='Page:')

def _plot_column(ax, column, strategy):
    """Draw one column of the global df_N on `ax` as a histogram or a box plot."""
    if strategy == 'Histogram':
        sns.histplot(df_N[column], kde=True, ax=ax)
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
    elif strategy == 'Box Plot':
        sns.boxplot(x=column, data=df_N, ax=ax)
        ax.set_title(f'Box Plot of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Values')


# Function to update plots based on selected strategy and page
def update_plots(strategy, page):
    """Render one dashboard page: up to four numeric columns of df_N on a 2x2 grid.

    Parameters:
    strategy (str): 'Histogram' or 'Box Plot'; any other value leaves the axes empty.
    page (int): 1-based page number; page p shows columns 4*(p-1) .. 4*p-1 of df_N.

    The page's columns are split in half: the first half goes on the top row and
    the rest on the bottom row, the same placement as before. Grid cells that
    receive no column are hidden instead of being left as blank axes.
    """
    start_index = (page - 1) * 4
    end_index = min(page * 4, len(df_N.columns))
    columns_to_display = df_N.columns[start_index:end_index]

    # Split columns for the top and bottom rows
    mid = len(columns_to_display) // 2
    columns_by_row = (columns_to_display[:mid], columns_to_display[mid:])

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    plt.subplots_adjust(hspace=0.4)

    for row_index, row_columns in enumerate(columns_by_row):
        for col_index, ax in enumerate(axes[row_index]):
            if col_index < len(row_columns):
                _plot_column(ax, row_columns[col_index], strategy)
            else:
                # Nothing to draw in this cell of the grid
                ax.set_visible(False)

    plt.show()

# Interactivity
widgets.interactive(update_plots, strategy=strategy_dropdown, page=page_dropdown)

In [None]:
# Replace heavily skewed numeric columns with a two-level label around a threshold.
# Each rule is (column, threshold, whether a value equal to the threshold counts as "less than").
bin_rules = [
    ('Renowned in professional circle', 450, True),
    ('Percent_skill_Finance', 15, False),
    ('Percent_skill_Investment', 15, False),
    ('Percent_skill_Law', 7, False),
    ('Percent_skill_Consulting', 6, False),
    ('Number of Investors in Angel and or VC', 1.5, True),
]

for column, threshold, equal_is_less in bin_rules:
    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        # Already converted to labels by an earlier run of this cell
        continue
    is_less = values <= threshold if equal_is_less else values < threshold
    labels = pd.Series(
        np.where(is_less, f'less than {threshold}', f'great than {threshold}'),
        index=values.index,
        dtype=object,
    )
    # Comparisons with NaN are False, so a bare np.where pushed every missing value into
    # whichever label sat in the "else" branch. Keep missing values missing instead, so the
    # missing-value handling later in the notebook still sees them.
    df[column] = labels.where(values.notna())

In [None]:
# Apply log(1 + x) to every numeric column of df.
# NOTE(review): log1p yields NaN for values below -1 and -inf at exactly -1 — confirm no
# column holds such values. Re-running this cell applies the transform a second time.
for feature in df.select_dtypes(include=['int', 'float']).columns:
    df[feature] = np.log1p(df[feature])

# Refresh the dashboard's frame AFTER the transform. select_dtypes returns a copy, so a df_N
# captured before the loop would keep the untransformed values and the dashboard would
# plot the old data.
df_N = df.select_dtypes(include=['int', 'float'])

**The plots show that many columns contain a large number of outliers, so we apply a logarithmic transformation. Some columns still have outliers even after it, so before the transformation we replace the values of those columns with the labels "less than threshold" and "great than threshold" to solve this problem.**

In [None]:
widgets.interactive(update_plots, strategy=strategy_dropdown, page=page_dropdown)

In [None]:
#"Run the droping function to remove the unnecessary columns identified above."
droping(df)

In [None]:

def replace_nan_with_median(df):
    """
    Return a copy of `df` whose int/float columns have NaN replaced by the column median.

    The input frame is not modified. Non-numeric columns are left untouched, and
    a numeric column that is entirely NaN stays NaN because it has no median.
    """
    # Copy the DataFrame to avoid modifying the original DataFrame
    df_filled = df.copy()

    numeric_columns = df_filled.select_dtypes(include=['int', 'float']).columns

    for col in numeric_columns:
        # Assign the filled Series back: fillna(inplace=True) on df_filled[col] acts on a
        # temporary object under pandas copy-on-write and would leave the frame unchanged.
        df_filled[col] = df_filled[col].fillna(df_filled[col].median())

    return df_filled
df=replace_nan_with_median(df)

In [None]:
def replace_nan_with_mode(df):
    """
    Return a copy of `df` whose object columns have NaN replaced by the column mode.

    The input frame is not modified. When several values tie for most frequent,
    the first one returned by Series.mode() is used. A column with no non-missing
    value has no mode and is left as it is.
    """
    # Copy the DataFrame to avoid modifying the original DataFrame
    df_filled = df.copy()

    object_columns = df_filled.select_dtypes(include=['object']).columns

    for col in object_columns:
        modes = df_filled[col].mode()
        if modes.empty:
            # Entirely missing column: indexing the empty mode with [0] would raise
            continue
        # Assign the filled Series back: fillna(inplace=True) on df_filled[col] acts on a
        # temporary object under pandas copy-on-write and would leave the frame unchanged.
        df_filled[col] = df_filled[col].fillna(modes.iloc[0])

    return df_filled
df=replace_nan_with_mode(df)

In [None]:
custom_info(df)

In [None]:
df.head()

In [None]:
# Handling infinite values, impute missing values, and encode categorical data.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Select numeric columns
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

# Impute missing values in numeric columns using the median strategy
imputer = SimpleImputer(strategy='median')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Ordinal-encode the categorical (object) columns ONLY. Fitting the encoder on the whole
# frame also replaced every numeric column by the rank of its values, discarding the
# actual magnitudes.
categorical_columns = df.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder()
category_codes = encoder.fit_transform(df[categorical_columns])

# Numeric columns are carried over unchanged; df's index is kept
df_encoded = df.copy()
for position, column in enumerate(categorical_columns):
    df_encoded[column] = category_codes[:, position]

# Display the first 10 rows of the encoded DataFrame
df_encoded.head(10)

In [None]:
# Separate the target column from the feature matrix
target_column = 'Dependent-Company Status'
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed at 42.
# NOTE(review): the split is not stratified on y — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Report the dimensions of each piece of the train/test split
for label, part in (("X Train", X_train), ("y Train", y_train), ("X test", X_test), ("y test", y_test)):
    print(f"Shape of the {label} :", part.shape)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,roc_curve, auc, precision_recall_curve, f1_score

In [None]:
# pip install xgboost

In [None]:
from xgboost import XGBClassifier

#train (seeded so the run is repeatable)
xgb = XGBClassifier(random_state=42)

xgb.fit(X_train,y_train)

#predict: hard labels for the accuracy/confusion metrics, positive-class probability for the curves
y_predicted_xgb = xgb.predict(X_test)
y_score_xgb = xgb.predict_proba(X_test)[:, 1]

print("Training Accuracy :", xgb.score(X_train, y_train))
print("Testing Accuracy :", xgb.score(X_test, y_test))

#eval
cm = confusion_matrix(y_test, y_predicted_xgb)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_predicted_xgb)
print(cr)

print("------------------------------------------")

# ROC and precision-recall curves need a continuous score. With hard 0/1 predictions each
# curve has a single operating point and the area under it is not a real AUC.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_xgb)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

precision, recall, thresholds = precision_recall_curve(y_test, y_score_xgb)
Precision_Recall_xgb = auc(recall, precision)
print("Precision-Recall Curves =",Precision_Recall_xgb)

# F1 is defined on the hard labels; it was computed before but never shown
f1 = f1_score(y_test, y_predicted_xgb)
print("F1 score                =",f1)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
#train
gbc = GradientBoostingClassifier(learning_rate=0.02,
                    max_depth=4,
                    random_state=100, n_estimators=1000)


gbc.fit(X_train,y_train)

#predict: hard labels for the accuracy/confusion metrics, positive-class probability for the curves
y_predicted_gb = gbc.predict(X_test)
y_score_gb = gbc.predict_proba(X_test)[:, 1]

print("Training Accuracy :", gbc.score(X_train, y_train))
print("Testing Accuracy :", gbc.score(X_test, y_test))

#eval
cm = confusion_matrix(y_test, y_predicted_gb)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_predicted_gb)
print(cr)


print("------------------------------------------")

# ROC and precision-recall curves need a continuous score. With hard 0/1 predictions each
# curve has a single operating point and the area under it is not a real AUC.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_gb)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

precision, recall, thresholds = precision_recall_curve(y_test, y_score_gb)
Precision_Recall_gbs = auc(recall, precision)
print("Precision-Recall Curves =",Precision_Recall_gbs)

# F1 is defined on the hard labels; it was computed before but never shown
f1 = f1_score(y_test, y_predicted_gb)
print("F1 score                =",f1)



In [None]:
from sklearn.ensemble import AdaBoostClassifier
#train (seeded so the run is repeatable)
ada = AdaBoostClassifier(random_state=42)


ada.fit(X_train,y_train)

#predict: hard labels for the accuracy/confusion metrics, positive-class probability for the curves
y_predicted_ab = ada.predict(X_test)
y_score_ab = ada.predict_proba(X_test)[:, 1]

print("Training Accuracy :", ada.score(X_train, y_train))
print("Testing Accuracy :", ada.score(X_test, y_test))

#eval
cm = confusion_matrix(y_test, y_predicted_ab)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_predicted_ab)
print(cr)

print("------------------------------------------")

# ROC and precision-recall curves need a continuous score. With hard 0/1 predictions each
# curve has a single operating point and the area under it is not a real AUC.
# (The ROC AUC was previously computed and printed twice in this cell; once is enough.)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_ab)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

precision, recall, thresholds = precision_recall_curve(y_test, y_score_ab)
Precision_Recall_abs = auc(recall, precision)
print("Precision-Recall Curves =",Precision_Recall_abs)

# F1 is defined on the hard labels; it was computed before but never shown
f1 = f1_score(y_test, y_predicted_ab)
print("F1 score                =",f1)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# train (seeded: an unseeded forest gives different results on every run)
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train,y_train)

# predict: hard labels for the accuracy/confusion metrics, positive-class probability for the curves
y_pred_rf = rf.predict(X_test)
y_score_rf = rf.predict_proba(X_test)[:, 1]

print("Training Accuracy :", rf.score(X_train, y_train))
print("Testing Accuracy :", rf.score(X_test, y_test))

cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_pred_rf)
print(cr)


print("------------------------------------------")

# ROC and precision-recall curves need a continuous score. With hard 0/1 predictions each
# curve has a single operating point and the area under it is not a real AUC.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

precision, recall, thresholds = precision_recall_curve(y_test, y_score_rf)
Precision_Recall_rfs = auc(recall, precision)
print("Precision-Recall Curves =",Precision_Recall_rfs)

# F1 is defined on the hard labels; it was computed before but never shown
f1 = f1_score(y_test, y_pred_rf)
print("F1 score                =",f1)

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Test-set predictions of each fitted model, keyed by the label used in the comparison chart
predictions_by_model = {
    'GradientBoosting Classifier': y_predicted_gb,
    'Adaboost Classifier': y_predicted_ab,
    'XGBoost': y_predicted_xgb,
    'Random Forest': y_pred_rf,
}

# Precision and recall of every model against the held-out labels
scores = {
    model_name: {
        'precision_score': precision_score(y_test, predicted),
        'recall_score': recall_score(y_test, predicted),
    }
    for model_name, predicted in predictions_by_model.items()
}

In [None]:
from sklearn.metrics import precision_score

# Area under the precision-recall curve computed in each model's cell
pr_auc_by_model = {
    'GradientBoosting Classifier': Precision_Recall_gbs,
    'Adaboost Classifier': Precision_Recall_abs,
    'XGBoost': Precision_Recall_xgb,
    'Random Forest': Precision_Recall_rfs,
}

# Same nested layout as `scores`: one inner dict per model
Precision_Recall = {
    model_name: {'Precision_Recall': pr_auc}
    for model_name, pr_auc in pr_auc_by_model.items()
}

In [None]:
# One column per model, one row per metric
scores = pd.DataFrame(scores)

# Horizontal bars grouped by metric, one bar per model
scores_ax = scores.plot(kind="barh", figsize=(12, 12))
scores_ax.legend(loc='upper center', ncol=3, title="Machine Learning Model")



In [None]:
# One column per model, a single row holding the precision-recall AUC
Precision_Recall = pd.DataFrame(Precision_Recall)

# Horizontal bars, one per model
precision_recall_ax = Precision_Recall.plot(kind="barh", figsize=(15, 8))
precision_recall_ax.legend(loc='upper center', ncol=3, title="Machine Learning Model")


In [None]:
# NOTE(review): leftover scratch cell — it only displays a literal list and can be deleted.
[1,2,3]