In [2]:
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Remote locations of the labelled training split and the test split.
URL_TRAIN_CSV = "https://gitlab.com/villainnumber1/filesml_proyecto_2/-/raw/main/train.csv"
URL_TEST_CSV = "https://gitlab.com/villainnumber1/filesml_proyecto_2/-/raw/main/test.csv"

# Download both splits, train first and test second.
# Naming note: X is the training frame and Y is the *test* frame (not a label
# vector); the training labels are read later from X["Credit_Score"].
X, Y = (
    pd.read_csv(csv_url, sep=",")
    for csv_url in (URL_TRAIN_CSV, URL_TEST_CSV)
)

X

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,43161,3985,4,Lawrencez,40.0,856431180.0,Accountant,97384.44,8167.370000,1.0,...,Good,405.45,32.643281,336.0,No,77.284943,53.714219,High_spent_Large_value_payments,847.172909,Standard
1,40663,49494,2,Mohammedr,39.0,417453942.0,Writer,31369.58,2607.131667,8.0,...,Bad,4259.91,39.375507,144.0,Yes,171.748418,44.990811,High_spent_Large_value_payments,280.855383,Standard
2,111059,2925,6,Alistairn,17.0,499528888.0,Writer,30171.86,2772.321667,6.0,...,Bad,3406.51,34.039021,123.0,Yes,205.403926,40.781489,High_spent_Large_value_payments,247.322615,Standard
3,72139,21358,2,Dinesh Naire,22.0,859449371.0,Doctor,13100.02,1246.668333,7.0,...,Standard,727.29,34.747262,129.0,Yes,41.177132,23.005463,Low_spent_Large_value_payments,298.156832,Standard
4,119841,21221,4,Bakerg,26.0,485239890.0,Doctor,62313.54,5120.795000,6.0,...,Standard,817.64,24.545394,122.0,NM,196.679642,63.346750,Low_spent_Medium_value_payments,152.669902,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,42636,39869,7,Hendrik Sackmannb,50.0,217529489.0,Media_Manager,107761.29,9047.107500,4.0,...,Standard,34.09,41.899992,211.0,Yes,0.000000,150.220682,High_spent_Large_value_payments,984.123010,Standard
79996,114031,38305,2,Alexj,33.0,98861902.0,Scientist,16218.32,1098.526667,6.0,...,Bad,4466.33,24.275710,98.0,NM,64.533114,34.457893,High_spent_Small_value_payments,260.523798,Poor
79997,85991,9976,6,Luciana Lopezi,25.0,745835485.0,Engineer,29374.77,2377.897500,3.0,...,Standard,1189.37,38.439021,241.0,NM,40.264074,22.234178,High_spent_Medium_value_payments,403.980355,Standard
79998,21240,13193,7,Edward Krudyd,15.0,894419091.0,Manager,43782.45,3543.537500,6.0,...,Bad,3468.25,24.233841,85.0,Yes,240.664739,42.839562,Low_spent_Large_value_payments,146.741586,Poor


In [4]:
# Labels for the training columns, listed in the positional order used to select
# them (position 0 .. 27). ColumnTransformer picks columns by these positions,
# so the labels themselves are only descriptive.
_column_labels = [
    "ID",
    "Customer_ID",
    "Month",  # Could the month influence the customer's credit?
    "Name",
    "Age",
    "SSN",
    "Occupation",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Type_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payments",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Credit_Mix",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "Payment_of_Min_Amount",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Payment_Behaviour",
    "Monthly_Balance",
    "Credit_Score",
]

# The only columns forwarded unchanged to the model; every other column
# (identifiers, categorical fields and the Credit_Score target) is dropped.
_passthrough_labels = {
    "Age",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
}

# One (label, action, [position]) entry per training column.
transformersX = [
    (label, "passthrough" if label in _passthrough_labels else "drop", [position])
    for position, label in enumerate(_column_labels)
]

# Same plan for the test frame, minus the final Credit_Score entry
# (presumably because the test CSV has no target column -- confirm).
# The remaining entries are numbered 0 .. n-1 again.
transformersY = [
    (label, action, [position])
    for position, (label, action, _) in enumerate(transformersX[:27])
]

# Each frame gets its own transformer; since every step is "drop" or
# "passthrough", the result is just the selected columns of that frame.
X_T = ColumnTransformer(transformers=transformersX).fit_transform(X)
Y_T = ColumnTransformer(transformers=transformersY).fit_transform(Y)

In [6]:
from sklearn.metrics import f1_score

# Baseline model: a 100-tree random forest capped at depth 10, trained on the
# selected training features against the Credit_Score labels.
# `fit` returns the estimator itself, so `clf` is the fitted forest.
clf = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_T, X["Credit_Score"])

# Predicted Credit_Score label for every row of the test features.
pred = clf.predict(Y_T)


In [7]:
# Search space: 5 forest sizes (5, 15, ..., 45) x 6 depth limits (1, 6, ..., 26),
# i.e. 30 candidate configurations.
parameters = {
    'n_estimators': range(5, 51, 10),
    'max_depth': range(1, 30, 5),
}

# Template estimator for the search. Its n_estimators / max_depth values are
# replaced by each grid candidate; only random_state carries through.
# NOTE(review): this rebinds `clf`, which an earlier cell used for the fitted
# baseline model -- after this cell `clf` no longer refers to that object.
clf = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)

# 5-fold cross-validated search scored with micro-averaged F1, using all cores.
gs = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring="f1_micro",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_T, X["Credit_Score"])

# Full cross-validation report, best-ranked configuration first.
pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score')






Fitting 5 folds for each of 30 candidates, totalling 150 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,6.840447,0.943701,0.462403,0.066363,26,45,"{'max_depth': 26, 'n_estimators': 45}",0.806,0.799875,0.79875,0.800125,0.798562,0.800662,0.002737,1
28,5.100698,0.06334,0.42014,0.010936,26,35,"{'max_depth': 26, 'n_estimators': 35}",0.804937,0.800375,0.798063,0.800375,0.799438,0.800638,0.002311,2
27,3.711381,0.045046,0.346922,0.003268,26,25,"{'max_depth': 26, 'n_estimators': 25}",0.804562,0.799188,0.79775,0.802063,0.797813,0.800275,0.002653,3
26,2.285218,0.038575,0.285669,0.010254,26,15,"{'max_depth': 26, 'n_estimators': 15}",0.803312,0.796875,0.797813,0.799438,0.795,0.798488,0.002807,4
24,6.796962,0.750485,0.473696,0.040373,21,45,"{'max_depth': 21, 'n_estimators': 45}",0.798438,0.791562,0.794438,0.796937,0.794813,0.795237,0.002343,5
22,3.561122,0.026551,0.338047,0.006875,21,25,"{'max_depth': 21, 'n_estimators': 25}",0.798562,0.7875,0.793813,0.796625,0.792687,0.793838,0.003784,6
23,4.865917,0.062389,0.390356,0.004488,21,35,"{'max_depth': 21, 'n_estimators': 35}",0.797063,0.788125,0.793062,0.796188,0.7925,0.793387,0.00316,7
21,2.17293,0.025095,0.274379,0.007195,21,15,"{'max_depth': 21, 'n_estimators': 15}",0.795438,0.787062,0.790813,0.794125,0.791438,0.791775,0.002905,8
25,0.88455,0.015014,0.217122,0.001385,26,5,"{'max_depth': 26, 'n_estimators': 5}",0.796375,0.79,0.79175,0.790937,0.788937,0.7916,0.002566,9
20,0.840041,0.013698,0.222849,0.00848,21,5,"{'max_depth': 21, 'n_estimators': 5}",0.786687,0.77575,0.780125,0.784,0.781875,0.781688,0.00369,10


In [None]:
# Submission table: one row per test record, pairing its ID with the predicted
# Credit_Score.
# NOTE(review): `pred` is produced by the fixed max_depth=10 forest's
# clf.predict(Y_T) call, not by the grid-search result `gs` -- confirm that
# this is the intended model for the submission.
submission_columns = {
    "ID": Y["ID"],
    "Credit_Score": pred,
}
solucion = pd.DataFrame(submission_columns)

solucion

In [None]:
# Write the submission to the working directory, omitting the DataFrame index.
solucion.to_csv(path_or_buf="solucion.csv", index=False)