In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.linear_model import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.datasets import make_regression
from sklearn.metrics import *
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier, XGBRegressor
from imblearn.datasets import make_imbalance
from category_encoders.target_encoder import TargetEncoder
import statsmodels.api as sm
import datetime as dt

In [None]:
def upsample_classes(data, target, random_state=None):
    """Balance ``data`` by upsampling every minority class of ``target``.

    The class with the most rows is kept unchanged; every other class is
    resampled with replacement until it has as many rows as that majority
    class. Any number of classes is supported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the ``target`` column.
    target : str
        Name of the column holding the class labels.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator used for the resampling. With ``None`` the
        sampling is not seeded, so the result differs between runs.

    Returns
    -------
    pandas.DataFrame
        The majority-class rows followed by the upsampled rows of each other
        class, in order of first appearance of the class. Original index
        labels are kept, so the index contains duplicates. Rows whose target
        is missing are left out. If there is no label at all, an empty frame
        with the same columns is returned.
    """
    # Unique labels in order of first appearance. Missing labels are skipped:
    # they never compare equal to anything, so they cannot form a class.
    labels = list(data[target].dropna().unique())
    if not labels:
        return data.iloc[0:0].copy()

    groups = {label: data[data[target] == label] for label in labels}

    # max() returns the first maximal element, so ties go to the class that
    # appears first in the data.
    majority_label = max(labels, key=lambda label: len(groups[label]))
    n_majority = len(groups[majority_label])

    # Share one generator across classes so a fixed seed does not make every
    # class replay the same sequence of draws.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    parts = [groups[majority_label]]
    for label in labels:
        if label != majority_label:
            parts.append(
                groups[label].sample(n=n_majority, replace=True, random_state=rng)
            )

    return pd.concat(parts)

In [None]:
def target_encoding(data, column, target):
    """Replace each category of ``column`` with the mean of ``target`` for it.

    The encoded values are written into ``data`` in place, and the same frame
    object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``column`` and ``target``; it is modified.
    column : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean becomes the encoding.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``column`` replaced by the category means.
        Entries whose category has no mean (for example missing values,
        which groupby skips) become NaN.
    """
    # Per-category mean of the target, indexed by category value.
    category_means = data.groupby(column)[target].mean()

    # Mapping with a Series yields NaN for unknown keys, whereas a dict lookup
    # inside a lambda would raise KeyError on a missing category.
    data[column] = data[column].map(category_means)

    return data

In [None]:
def encode_cat(df, categorical_features, target='Target'):
    """Target-encode several categorical columns of ``df`` in place.

    Each listed column is replaced by the mean of ``target`` within each of
    its categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns and ``target``; it is modified.
    categorical_features : iterable of str
        Names of the columns to encode.
    target : str, default 'Target'
        Name of the column whose per-category mean becomes the encoding.

    Returns
    -------
    None
        The function works through its side effect on ``df``. Entries whose
        category has no mean (for example missing values, which groupby
        skips) become NaN.
    """
    for cat in categorical_features:
        # Per-category mean of the target as plain floats.
        category_means = df.groupby(cat)[target].mean().astype(float)
        # Series-based map: unknown keys become NaN instead of raising KeyError.
        df[cat] = df[cat].map(category_means)

In [None]:
# Three label encoders and three target encoders, each name bound to its own
# separate instance.
lab1, lab2, lab3 = LabelEncoder(), LabelEncoder(), LabelEncoder()
tab1, tab2, tab3 = TargetEncoder(), TargetEncoder(), TargetEncoder()

In [None]:
# Take a start timestamp and immediately report the time elapsed since it.
start = dt.datetime.now()
print('Elapsed time: ', dt.datetime.now() - start)

In [None]:
# Load the dataset from the file 'df_pipe' (relative to the working
# directory), parsed as CSV.
df = pd.read_csv('df_pipe')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Scaler instances: max-abs scaling (ma) and standardisation (ss).
ma, ss = MaxAbsScaler(), StandardScaler()

In [None]:
# Partition df column-wise: non-object features, object-dtype features, and
# the two price targets.
df_num = df.select_dtypes(exclude='O').drop(columns=['price', 'price_bin'])
df_cat = df.select_dtypes(include='O')
df_tar = df.loc[:, ['price', 'price_bin']]

In [None]:
# Scale every non-object feature by its maximum absolute value. Reusing
# df_num's own columns and index (instead of re-deriving the column list from
# df and falling back to a fresh 0..n-1 index) keeps the scaled frame aligned
# with df_cat and df_tar for the column-wise concat, and keeps this cell
# working after df itself has been modified.
df_num = pd.DataFrame(
    ma.fit_transform(df_num),
    columns=df_num.columns,
    index=df_num.index,
)

In [None]:
# Reassemble scaled features, object-dtype features and targets side by side.
df2 = pd.concat([df_num, df_cat, df_tar], axis='columns')

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index written out when
# the CSV was saved -- verify) from both frames. Reassigning instead of
# mutating in place, and ignoring an already-missing column, lets this cell
# be re-run without raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df2 = df2.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview the first rows of df (unscaled features).
df.head()

In [None]:
# Preview the first rows of df2 (max-abs scaled features).
df2.head()

In [None]:
# Target-encode every object-dtype column of df with the mean 'price' of each
# category. target_encoding() returns the whole frame, so df is rebound to
# it; assigning that multi-column frame to the single column df[col] is not
# a valid column assignment.
# NOTE(review): the category means are computed on the full dataset, before
# the train/test split, so the encoded features carry price information from
# rows that later land in the test set -- confirm this is acceptable.
for col in df.select_dtypes(include='O').columns:
    df = target_encoding(df, col, 'price')

In [None]:
# Target-encode every object-dtype column of df2 with the mean 'price' of
# each category. target_encoding() returns the whole frame, so df2 is rebound
# to it; assigning that multi-column frame to the single column df2[col] is
# not a valid column assignment.
for col in df2.select_dtypes(include='O').columns:
    df2 = target_encoding(df2, col, 'price')

In [None]:
# Preview the first rows of df2 after target encoding.
df2.head()

In [None]:
# Preview the first rows of df after target encoding.
df.head()

In [None]:
# Feature matrix plus the two targets: the price bin (y1) and the raw price (y2).
X = df.drop(columns=['price', 'price_bin'])
y1 = df['price_bin']
y2 = df['price']

In [None]:
# Mean 5-fold cross-validated accuracy of a logistic regression that predicts
# the price bin from all features.
logreg = LogisticRegression(random_state=14)
cross_val_score(estimator=logreg, X=X, y=y1, scoring='accuracy', cv=5).mean()

In [None]:
# Hold out a test set for the price-bin classifier. The fixed seed (the same
# value used for the estimators in this notebook) makes the split, and every
# score derived from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y1, random_state=14)

In [None]:
# Balance the training split only: join features and labels into one frame so
# that upsample_classes can resample whole rows.
upsample_train = upsample_classes(
    pd.concat([X_train, y_train], axis='columns'),
    'price_bin',
)

In [None]:
# Split the balanced training frame back into features and labels.
X_train = upsample_train.drop(columns='price_bin')
y_train = upsample_train['price_bin']

In [None]:
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])

In [None]:
pipe.fit(X_train,y_train)

In [None]:
p=pipe.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped
# the precision and recall columns trade places and "support" counts the
# predictions instead of the true labels.
print(classification_report(y_test, p))

In [None]:
# Tuned classifier: min-max scaling, PCA down to 7 components, then an SVC
# whose kernel and hyper-parameters are chosen by grid search.
pipe = Pipeline([('scl', MinMaxScaler()),
                 ('pca', PCA(n_components=7)),
                 ('svm', SVC(random_state=14))])

# `degree` is only used by the polynomial kernel, so it is searched for
# 'poly' alone. Crossing it with 'sigmoid' refits the identical model once
# per degree value (48 candidates instead of 30) without exploring anything
# new.
grid = [{'svm__kernel': ['poly'],
         'svm__C': [0.01, 1, 100],
         'svm__degree': [2, 3, 4, 5],
         'svm__gamma': [0.001, 0.01]},
        {'svm__kernel': ['sigmoid'],
         'svm__C': [0.01, 1, 100],
         'svm__gamma': [0.001, 0.01]}]

gridsearch = GridSearchCV(estimator=pipe,
                          param_grid=grid,
                          scoring='accuracy',
                          cv=3)

# NOTE(review): X_train was upsampled with replacement before this search, so
# copies of the same row can fall into both the fitting and the validation
# part of a fold; the cross-validated accuracy is likely optimistic.
gridsearch.fit(X_train, y_train)