In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.linear_model import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.datasets import make_regression
from sklearn.metrics import *
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier, XGBRegressor
from imblearn.datasets import make_imbalance
from category_encoders.target_encoder import TargetEncoder
import statsmodels.api as sm
import datetime as dt

In [2]:
def upsample_classes(data, target):
    
    labels = list(data[target].unique())
    i = 0
    lst = []
    while len(lst) < 3:
        lst.append(labels[i])
        i+=1
    
    classes = []
    for c in lst:
        classes.append(data[data[target]==c])
    
    length = 0
    class_lab = None
    for c in classes:
        if len(c)>length:
            length=len(c)
            class_lab = c
    class_lab = class_lab[target].unique()[0]
    
    a = pd.concat(classes)
    maj_class = a[a[target]==class_lab]

    lst.remove(class_lab)
    
    new_classes=[]
    for i in lst:
        new_classes.append(resample(data[data[target]==i],replace=True, n_samples=len(maj_class)))

    b = pd.concat(new_classes)
    upsample = pd.concat([a[a[target]==class_lab],b])

    return upsample

In [3]:
def target_encoding(data, column, target):
    
    grouped = data[[column,target]].groupby(column,as_index=False).mean()
    empty_dict = {}
    for i in range(len(grouped)):
        empty_dict[grouped.iloc[i,0]]=grouped.iloc[i,1]
    data[column]=data[column].map(lambda x: empty_dict[x])
    
    return data

In [4]:
def encode_cat(df, categorical_features):
    for cat in categorical_features:
        dummy_df = df[[cat,'Target']].groupby([cat],as_index=False).mean()
        dummy_dict = {}
        for i in range(len(dummy_df)):
            dummy_dict[dummy_df.iloc[i,0]]=float(dummy_df.iloc[i,1])
        df[cat] = df[cat].map(lambda x: dummy_dict[x])

In [5]:
lab1 = LabelEncoder()
lab2 = LabelEncoder()
lab3 = LabelEncoder()
tab1 = TargetEncoder()
tab2 = TargetEncoder()
tab3 = TargetEncoder()

In [6]:
start=dt.datetime.now()
print('Elapsed time: ',str(dt.datetime.now()-start))

Elapsed time:  0:00:00.000042


In [7]:
df = pd.read_csv('df_pipe')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,volume,price_bin,price
0,0,0.23,Ideal,E,SI2,61.5,55.0,38.20203,0.0,326.0
1,1,0.21,Premium,E,SI1,59.8,61.0,34.505856,0.0,326.0
2,2,0.23,Good,E,VS1,56.9,65.0,38.076885,0.0,327.0
3,3,0.29,Premium,I,VS2,62.4,58.0,46.72458,0.0,334.0
4,4,0.31,Good,J,SI2,63.3,58.0,51.91725,0.0,335.0


In [9]:
ma = MaxAbsScaler()
ss = StandardScaler()

In [10]:
df_num = df.select_dtypes(exclude='O').drop(['price','price_bin'],axis=1)
df_cat = df.select_dtypes(include='O')
df_tar = df[['price','price_bin']]

In [11]:
df_num=pd.DataFrame(ma.fit_transform(df_num),columns=df.select_dtypes(exclude='O').drop(['price','price_bin'],axis=1).columns)

In [12]:
df2=pd.concat([df_num,df_cat,df_tar],axis=1)

In [13]:
df.drop('Unnamed: 0',axis=1,inplace=True)
df2.drop('Unnamed: 0',axis=1,inplace=True)

In [14]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,volume,price_bin,price
0,0.23,Ideal,E,SI2,61.5,55.0,38.20203,0.0,326.0
1,0.21,Premium,E,SI1,59.8,61.0,34.505856,0.0,326.0
2,0.23,Good,E,VS1,56.9,65.0,38.076885,0.0,327.0
3,0.29,Premium,I,VS2,62.4,58.0,46.72458,0.0,334.0
4,0.31,Good,J,SI2,63.3,58.0,51.91725,0.0,335.0


In [15]:
df2.head()

Unnamed: 0,carat,depth,table,volume,cut,color,clarity,price,price_bin
0,0.093878,0.922039,0.846154,0.094664,Ideal,E,SI2,326.0,0.0
1,0.085714,0.896552,0.938462,0.085505,Premium,E,SI1,326.0,0.0
2,0.093878,0.853073,1.0,0.094354,Good,E,VS1,327.0,0.0
3,0.118367,0.935532,0.892308,0.115783,Premium,I,VS2,334.0,0.0
4,0.126531,0.949025,0.892308,0.12865,Good,J,SI2,335.0,0.0


In [16]:
for col in df.select_dtypes(include='O').columns:
    df[col]=target_encoding(df,col,'price')

In [17]:
for col in df2.select_dtypes(include='O').columns:
    df2[col]=target_encoding(df2,col,'price')

In [18]:
df2.head()

Unnamed: 0,carat,depth,table,volume,cut,color,clarity,price,price_bin
0,0.093878,0.922039,0.846154,0.094664,0.0938776,0.0938776,0.0938776,326.0,0.0
1,0.085714,0.896552,0.938462,0.085505,0.0857143,0.0857143,0.0857143,326.0,0.0
2,0.093878,0.853073,1.0,0.094354,0.0938776,0.0938776,0.0938776,327.0,0.0
3,0.118367,0.935532,0.892308,0.115783,0.118367,0.118367,0.118367,334.0,0.0
4,0.126531,0.949025,0.892308,0.12865,0.126531,0.126531,0.126531,335.0,0.0


In [19]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,volume,price_bin,price
0,0.23,0.23,0.23,0.23,61.5,55.0,38.20203,0.0,326.0
1,0.21,0.21,0.21,0.21,59.8,61.0,34.505856,0.0,326.0
2,0.23,0.23,0.23,0.23,56.9,65.0,38.076885,0.0,327.0
3,0.29,0.29,0.29,0.29,62.4,58.0,46.72458,0.0,334.0
4,0.31,0.31,0.31,0.31,63.3,58.0,51.91725,0.0,335.0


In [20]:
X=df.drop(['price','price_bin'],axis=1)
y1=df.price_bin
y2=df.price

In [21]:
logreg=LogisticRegression(random_state=14)
cross_val_score(logreg, X, y1, cv=5, scoring='accuracy').mean()

0.8298591549295775

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y1)

In [23]:
upsample_train = upsample_classes(pd.concat([X_train,y_train],axis=1),'price_bin')

In [24]:
X_train=upsample_train.drop('price_bin',axis=1)
y_train=upsample_train.price_bin

In [25]:
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])

In [None]:
pipe.fit(X_train,y_train)

In [None]:
p=pipe.predict(X_test)

In [None]:
print(classification_report(p,y_test))

In [None]:
pipe = Pipeline([('scl', MinMaxScaler()),
                ('pca', PCA(n_components=7)),
                ('svm', SVC(random_state=14))])

grid = [{'svm__kernel': ['poly', 'sigmoid'],
         'svm__C': [0.01, 1, 100],
         'svm__degree': [2,3,4,5],
         'svm__gamma': [0.001, 0.01]}]

gridsearch = GridSearchCV(estimator=pipe,
                  param_grid=grid,
                  scoring='accuracy',
                  cv=3)

gridsearch.fit(X_train, y_train)

In [None]:
categorical_transformer = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',
                        fill_value='missing')),('onehot',OneHotEncoder(handle_unknown='ignore'))])

numeric_transformer = Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),('scaler',QuantileTransformer())])

cat_super_code = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
                            ('magic',encode_cat(df,categorical_features))])