In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.linear_model import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.datasets import make_regression
from sklearn.metrics import *
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier, XGBRegressor
from imblearn.datasets import make_imbalance
from category_encoders.target_encoder import TargetEncoder
import statsmodels.api as sm

In [2]:
df = pd.read_csv('df_pipe')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,volume,price_bin,price
0,0,0.23,Ideal,E,SI2,61.5,55.0,38.20203,0.0,326.0
1,1,0.21,Premium,E,SI1,59.8,61.0,34.505856,0.0,326.0
2,2,0.23,Good,E,VS1,56.9,65.0,38.076885,0.0,327.0
3,3,0.29,Premium,I,VS2,62.4,58.0,46.72458,0.0,334.0
4,4,0.31,Good,J,SI2,63.3,58.0,51.91725,0.0,335.0


In [4]:
def target_encoding(data, column, target):
    
    grouped = data[[column,target]].groupby(column,as_index=False).mean()
    empty_dict = {}
    for i in range(len(grouped)):
        empty_dict[grouped.iloc[i,0]]=grouped.iloc[i,1]
    data[column]=data[column].map(lambda x: empty_dict[x])
    
    return data

In [6]:
ma = MaxAbsScaler()
ss = StandardScaler()

In [7]:
df_num = df.select_dtypes(exclude='O').drop(['price','price_bin'],axis=1)
df_cat = df.select_dtypes(include='O')
df_tar = df[['price','price_bin']]

In [8]:
df_num=ma.fit_transform(df_num)
df=pd.concat([pd.DataFrame(df_num,columns=df.select_dtypes(exclude='O').columns),df_cat,df_tar],axis=1)

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,volume,price_bin,price,cut,color,clarity
0,0.0,0.093878,0.922039,0.846154,0.094664,0.0,0.017319,Ideal,E,SI2
1,1.9e-05,0.085714,0.896552,0.938462,0.085505,0.0,0.017319,Premium,E,SI1
2,3.8e-05,0.093878,0.853073,1.0,0.094354,0.0,0.017372,Good,E,VS1
3,5.6e-05,0.118367,0.935532,0.892308,0.115783,0.0,0.017744,Premium,I,VS2
4,7.5e-05,0.126531,0.949025,0.892308,0.12865,0.0,0.017797,Good,J,SI2


In [None]:
target_encoding(df,'color','price')

In [None]:
target_encoding(df,'cut','price')

In [15]:
target_encoding(df,'clarity','price')

Unnamed: 0.1,Unnamed: 0,carat,depth,table,volume,price_bin,price,cut,color,clarity
0,0.000000,0.093878,0.922039,0.846154,0.094664,0.0,0.017319,0.182666,0.163039,0.262518
1,0.000019,0.085714,0.896552,0.938462,0.085505,0.0,0.017319,0.241008,0.163039,0.211442
2,0.000038,0.093878,0.853073,1.000000,0.094354,0.0,0.017372,0.206982,0.163039,0.203828
3,0.000056,0.118367,0.935532,0.892308,0.115783,0.0,0.017744,0.241008,0.266214,0.208033
4,0.000075,0.126531,0.949025,0.892308,0.128650,0.0,0.017797,0.206982,0.273955,0.262518
...,...,...,...,...,...,...,...,...,...,...
53245,0.999925,0.293878,0.911544,0.876923,0.287247,0.0,0.146470,0.182666,0.167714,0.211442
53246,0.999944,0.293878,0.946027,0.846154,0.292675,0.0,0.146470,0.206982,0.167714,0.211442
53247,0.999962,0.285714,0.941529,0.923077,0.283604,0.0,0.146470,0.210190,0.167714,0.211442
53248,0.999981,0.351020,0.914543,0.892308,0.348815,0.0,0.146470,0.241008,0.235409,0.262518


In [5]:
for col in df.select_dtypes(include='O').columns:
    df[col]=target_encoding(df,col,'price')

In [4]:
X=df.drop(['price','price_bin'],axis=1)
y1=df.price_bin
y2=df.price

In [5]:
logreg=LogisticRegression(random_state=14)
cross_val_score(logreg, X.select_dtypes(exclude='O'), y1, cv=5, scoring='accuracy').mean()

0.7439624413145539

In [6]:
lab1 = LabelEncoder()
lab2 = LabelEncoder()
lab3 = LabelEncoder()

In [7]:
tab1 = TargetEncoder()
tab2 = TargetEncoder()
tab2 = TargetEncoder()

In [8]:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['cut', 'color','clarity']))

In [9]:
pipe = make_pipeline(column_trans, logreg)

In [10]:
cross_val_score(pipe, X, y1, cv=5, scoring='accuracy').mean()

0.7250892018779342

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y1)

In [12]:
def upsample_classes(data, target):
    
    labels = list(data[target].unique())
    i = 0
    lst = []
    while len(lst) < 3:
        lst.append(labels[i])
        i+=1
    
    classes = []
    for c in lst:
        classes.append(data[data[target]==c])
    
    length = 0
    class_lab = None
    for c in classes:
        if len(c)>length:
            length=len(c)
            class_lab = c
    class_lab = class_lab[target].unique()[0]
    
    a = pd.concat(classes)
    maj_class = a[a[target]==class_lab]

    lst.remove(class_lab)
    
    new_classes=[]
    for i in lst:
        new_classes.append(resample(data[data[target]==i],replace=True, n_samples=len(maj_class)))

    b = pd.concat(new_classes)
    upsample = pd.concat([a[a[target]==class_lab],b])

    return upsample

In [13]:
upsample_train = upsample_classes(pd.concat([X_train,y_train],axis=1),'price_bin')

In [14]:
X_train=upsample_train.drop('price_bin',axis=1)
y_train=upsample_train.price_bin

In [15]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['cut', 'color',
                                                   'clarity'])])),
                ('logisticregression', LogisticRegression(random_state=14))])

In [16]:
p=pipe.predict(X_test)

In [17]:
print(classification_report(p,y_test))

              precision    recall  f1-score   support

         0.0       0.51      0.82      0.63      6021
         1.0       0.41      0.24      0.30      3905
         2.0       0.00      0.00      0.00         0
         3.0       0.44      0.06      0.11      3387

    accuracy                           0.46     13313
   macro avg       0.34      0.28      0.26     13313
weighted avg       0.46      0.46      0.40     13313



In [21]:
pipe = Pipeline([('scl', MinMaxScaler()),
                ('pca', PCA(n_components=10)),
                ('svm', SVC(random_state=123))])

grid = [{'svm__kernel': ['poly', 'sigmoid'],
         'svm__C': [0.01, 1, 100],
         'svm__degree': [2,3,4,5],
         'svm__gamma': [0.001, 0.01]}]

gridsearch = GridSearchCV(estimator=pipe,
                  param_grid=grid,
                  scoring='accuracy',
                  cv=3)

gridsearch.fit(X_train, y_train)

ValueError: could not convert string to float: 'Ideal'