# Competition Machine Learning

## Importación de librerías

In [247]:
%matplotlib inline

import pandas as pd

import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVR


from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt


from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

## Importación de dataset

In [248]:
df = pd.read_csv("Inputs/diamonds_train.csv")

In [249]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


## Exploración de datos

**Nivel General**

In [250]:
df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [251]:
df.corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price
id,1.0,0.001804,-0.003035,0.004436,0.003238,0.002021,0.002507,0.001823
carat,0.001804,1.0,0.023118,0.181725,0.976267,0.945757,0.968685,0.922345
depth,-0.003035,0.023118,1.0,-0.299534,-0.028765,-0.032894,0.092482,-0.013307
table,0.004436,0.181725,-0.299534,1.0,0.195775,0.182559,0.154399,0.126545
x,0.003238,0.976267,-0.028765,0.195775,1.0,0.967143,0.985385,0.886168
y,0.002021,0.945757,-0.032894,0.182559,0.967143,1.0,0.96035,0.860499
z,0.002507,0.968685,0.092482,0.154399,0.985385,0.96035,1.0,0.876061
price,0.001823,0.922345,-0.013307,0.126545,0.886168,0.860499,0.876061,1.0


**Carat = Quilates**

In [252]:
df.loc[[df['carat'].idxmax()]]

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
12413,12413,5.01,Fair,J,I1,65.5,59.0,10.74,10.54,6.98,18018


In [253]:
df.loc[[df['carat'].idxmin()]]

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
2955,2955,0.2,Premium,F,VS2,62.6,59.0,3.73,3.71,2.33,367


In [254]:
df["carat"].corr(df["price"])

0.9223452134291695

Los quilates representan una feature clave respecto al precio.

**Análisis de categóricas**

In [255]:
df["cut"].value_counts()

Ideal        16141
Premium      10303
Very Good     9068
Good          3631
Fair          1202
Name: cut, dtype: int64

In [256]:
df["color"].value_counts()

G    8462
E    7292
F    7133
H    6200
D    5094
I    4094
J    2070
Name: color, dtype: int64

## Borrado de columnas

In [257]:
df.drop(columns=["table","depth","y","z"], axis=1, inplace=True)

In [258]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,Premium,F,VS1,5.93,3446
1,1,0.31,Ideal,D,SI1,4.37,732
2,2,0.3,Ideal,F,SI1,4.3,475
3,3,1.04,Ideal,E,VVS2,6.54,9552
4,4,0.65,Ideal,J,SI1,5.58,1276


In [259]:
df["cut"].unique()

array(['Premium', 'Ideal', 'Very Good', 'Fair', 'Good'], dtype=object)

In [260]:
df = df.replace("Premium", 1).replace("Ideal", 2).replace("Very Good", 3).replace("Very Good", 4).replace("Fair", 5).replace("Good", 6)

df.head()

Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,1,F,VS1,5.93,3446
1,1,0.31,2,D,SI1,4.37,732
2,2,0.3,2,F,SI1,4.3,475
3,3,1.04,2,E,VVS2,6.54,9552
4,4,0.65,2,J,SI1,5.58,1276


In [261]:
df["color"].unique()

array(['F', 'D', 'E', 'J', 'H', 'I', 'G'], dtype=object)

In [262]:
df = df.replace("D", 1).replace("E", 2).replace("F", 3).replace("G", 4).replace("H", 5).replace("I", 6).replace("J", 7)

In [263]:
df = df.replace("IF", 1).replace("VVS1", 2).replace("VVS2", 3).replace("VS1", 4).replace("VS2", 5).replace("SI1", 6).replace("SI2", 7).replace("I1", 8)

df.head()


Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,1,3,4,5.93,3446
1,1,0.31,2,1,6,4.37,732
2,2,0.3,2,3,6,4.3,475
3,3,1.04,2,2,3,6.54,9552
4,4,0.65,2,7,6,5.58,1276


In [281]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,x,price
0,0,0.78,1,3,4,5.93,3446
1,1,0.31,2,1,6,4.37,732
2,2,0.3,2,3,6,4.3,475
3,3,1.04,2,2,3,6.54,9552
4,4,0.65,2,7,6,5.58,1276


In [283]:
df.dtypes

id           int64
carat      float64
cut          int64
color        int64
clarity      int64
x          float64
price        int64
dtype: object

## Pipeline

In [265]:
# pipeline = [
#     StandardScaler(),
#     Normalizer(),
# ]

In [266]:
# tr = make_pipeline(*pipeline)

In [267]:
# Xpr = tr.fit_transform(df)
# Xpr = pd.DataFrame(Xpr,columns=df.columns)

In [268]:
# Xpr.head()

## Aplicar modelo

In [282]:

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["price","id"],axis=1), df["price"], test_size=0.2)

In [270]:
# models = {
#     "svm-linear": CalibratedClassifierCV(LinearSVC(),cv=3),
#     "svm-rbf": CalibratedClassifierCV(SVC(kernel="sigmoid",gamma="auto"),cv=3),
#     "randomforest": RandomForestClassifier(n_estimators=100)
# }

# for name,m  in models.items():
#     print(f"Training {name}...")
#     m.fit(X_train, y_train)
# print("Train complete")

In [271]:
# model = SVR(kernel="rbf")
# model.fit(X_train, y_train)

In [272]:
# y_pred = model.predict(X_test)

In [273]:
# print("MSE", mean_squared_error(y_test,y_pred))
# print("RMSE", np.sqrt(mean_squared_error(y_test,y_pred)))

In [274]:
# y_pred

In [289]:
model = RandomForestClassifier(n_estimators=40, max_depth=40, min_samples_leaf=20)


In [290]:
model.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=40, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [291]:
y_pred = model.predict(X_test)

In [288]:
print("MSE", mean_squared_error(y_test,y_pred))
print("RMSE", np.sqrt(mean_squared_error(y_test,y_pred)))

MSE 655019.5892923535
RMSE 809.3328050266797
