# EXPLORATORY DATA ANALYSIS

#### IMPORT DES DONNEES

In [1]:
import pandas as pd

# Raw SBA loan file; `df` keeps every column as loaded.
df = pd.read_csv("../SBAnational.csv", low_memory=False)

# Working frame: the raw data minus the five columns listed below.
# `drop` returns a new DataFrame, so `df` itself is left untouched.
data = df.drop(
    columns=[
        'ChgOffDate',
        'DisbursementDate',
        'DisbursementGross',
        'BalanceGross',
        'ChgOffPrinGr',
    ]
)
data.head(3)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,2.0,0,0,1,0,N,Y,P I F,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,2.0,0,0,1,0,N,Y,P I F,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,1.0,0,0,1,0,N,N,P I F,"$287,000.00","$215,250.00"


In [2]:
# Sector lookup table, loaded without its "Default rate (%)" column.
data_secteurs = pd.read_csv("../csv_secteursactivite.csv").drop(columns="Default rate (%)")

# Range-style codes are collapsed onto the first code of their range.
mapping = {"31–33": "31", "44–45": "44", "48–49": "48"}
data_secteurs['2 digit code'] = data_secteurs['2 digit code'].replace(mapping)

# Extra rows for the other codes of those ranges, carrying the same descriptions.
nouvelles_lignes = [
    {'2 digit code': code, 'Description': description}
    for code, description in (
        (32, 'Manufacturing'),
        (33, 'Manufacturing'),
        (45, 'Retail trade'),
        (49, 'Transportation and warehousing'),
    )
]
nouvelles_lignes_df = pd.DataFrame(nouvelles_lignes)
data_secteurs = pd.concat([data_secteurs, nouvelles_lignes_df], ignore_index=True)

# Expose the code as an integer column named NAICS_id, then drop the original column.
data_secteurs["NAICS_id"] = data_secteurs["2 digit code"].astype(int)
data_secteurs = data_secteurs.drop(columns="2 digit code")
data_secteurs.head(5)

Unnamed: 0,Description,NAICS_id
0,"Mining, quarrying, and oil and gas extraction",21
1,"Agriculture, forestry, fishing and hunting",11
2,Management of companies and enterprises,55
3,Health care and social assistance,62
4,Utilities,22


### SWEETVIZ

In [3]:
# import sweetviz as sv
# df_test['ApprovalFY'] = pd.to_numeric(df_test['ApprovalFY'], errors='coerce')
# my_report = sv.analyze(df_test)
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

### YDATA

In [4]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, title="Profiling Report")
# profile

### NETTOYAGE DES DONNEES

In [5]:
# CONVERSION DES VARIABLES GrAppv ET SBA_Appv EN FORMAT FLOAT
def amount_to_float(s: object) -> float:
    """Convert a dollar amount such as ``"$60,000.00"`` to its float value.

    The ``$`` sign, thousands separators and surrounding whitespace are
    removed before conversion. Unlike a blind ``s[1:]`` slice, a value
    without a leading ``$`` keeps its first digit.

    Args:
        s: The amount to convert. Normally a string; numbers are passed
            through ``float`` and ``None`` is treated as missing.

    Returns:
        The amount as a float, or NaN when the value is missing
        (``None``, NaN, or a blank string).

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    if s is None:
        return float("nan")
    if isinstance(s, (int, float)):
        # Already numeric (includes NaN, which pandas uses for missing cells).
        return float(s)
    cleaned = str(s).strip().replace("$", "").replace(",", "")
    return float(cleaned) if cleaned else float("nan")

# Apply the same dollar-string -> float conversion to both approval amount columns.
for amount_column in ("GrAppv", "SBA_Appv"):
    data[amount_column] = data[amount_column].apply(amount_to_float)

In [6]:
# CONVERSION DE LA VARIABLE ApprovalFY EN FORMAT INTEGER
def str_to_int(s: object) -> "int | None":
    """Extract the decimal digits of a value and return them as an integer.

    Non-digit characters are discarded, so ``"1976A"`` yields ``1976``.
    Signs are discarded as well.

    Args:
        s: The value to convert. Anything other than a float is
            converted with ``str`` first. Finite floats are truncated to
            their integer part so that ``1997.0`` yields ``1997``
            rather than ``19970``.

    Returns:
        The integer formed by the digits, or ``None`` when the value
        contains no digit (including NaN, infinities and ``None``).
    """
    if isinstance(s, float):
        # NaN is the only float that is not equal to itself.
        if s != s or s in (float("inf"), float("-inf")):
            return None
        # Drop the fractional part so the decimal point does not glue
        # fractional digits onto the integer part.
        s = int(s)
    # isdecimal() keeps only characters int() can parse; isdigit() would
    # also accept characters such as superscripts, which int() rejects.
    digits = ''.join(char for char in str(s) if char.isdecimal())
    return int(digits) if digits else None

# Replace the raw fiscal-year values with their integer form.
approval_fy_raw = data["ApprovalFY"]
data["ApprovalFY"] = approval_fy_raw.apply(str_to_int)

In [7]:
# Convert ApprovalDate to datetime.
# The raw values look like "28-Feb-97"; giving the format explicitly avoids
# per-call format inference. Values that do not match become NaT.
approval_dates = pd.to_datetime(data['ApprovalDate'], format='%d-%b-%y', errors='coerce')

# "%y" maps two-digit years 00-68 to 2000-2068, so an approval from the 1960s
# would land in the future. An approval date cannot be later than today, so
# any such date is moved back one century.
in_future = approval_dates > pd.Timestamp.now()
approval_dates[in_future] = approval_dates[in_future] - pd.DateOffset(years=100)

data['ApprovalDate'] = approval_dates

In [8]:
# Sector key: the first two characters of the NAICS code, as an integer.
data['NAICS_id'] = data['NAICS'].astype(str).str[:2].astype(int)

# Attach the sector description to every loan.
# A left join keeps exactly the rows of `data`, in their original order.
# An outer join would also add sector-only rows that are empty on every loan column.
dataset = pd.merge(data, data_secteurs, on='NAICS_id', how='left')

# The join key is no longer needed once the description is attached.
# The whole frame is kept; only the preview below is limited to one row.
dataset = dataset.drop('NAICS_id', axis=1)
dataset.head(1)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Description
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,1997-02-28,1997,...,0,0,1,0,N,Y,P I F,60000.0,48000.0,Retail trade


In [9]:
# Display the dtype of each column of the merged frame.
dataset.dtypes 

LoanNr_ChkDgt             int64
Name                     object
City                     object
State                    object
Zip                       int64
Bank                     object
BankState                object
NAICS                     int64
ApprovalDate     datetime64[ns]
ApprovalFY                int64
Term                      int64
NoEmp                     int64
NewExist                float64
CreateJob                 int64
RetainedJob               int64
FranchiseCode             int64
UrbanRural                int64
RevLineCr                object
LowDoc                   object
MIS_Status               object
GrAppv                  float64
SBA_Appv                float64
Description              object
dtype: object

# MACHINE LEARNING

### PYCARET

In [10]:
# from pycaret.classification import *
# df_test = data.copy()
# df_test = df_test.dropna()
# session = setup(data=df_test, target='MIS_Status', normalize=True, train_size=0.8, data_split_stratify=True, fold=5, session_id=0)

In [11]:
# top_models = compare_models(sort="Accuracy")
# print(top_models)