In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import SVC
import pickle
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
def clean_data(file_name):
    """Load a survey CSV and normalise the columns consumed by the salary model.

    Steps, in order:
      1. read the CSV and drop every row that has a missing value in any column
         (no imputation is performed);
      2. collapse the salary brackets into two classes, 'B' (below 5K) and
         'M' (5K and above) -- skipped when the file has no 'salary' column,
         e.g. an unlabelled file that is only going to be predicted on;
      3. rewrite '10 or more' in 'person_living_in_house' as the string '10';
      4. strip the spaces out of 'kids_spending' and 'house_utility'.

    Parameters
    ----------
    file_name : str, path or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The columns touched in steps 3 and 4 remain strings.
    """
    df = pd.read_csv(file_name)

    # Equivalent to the original subset=<all columns>: any NA drops the row.
    df.dropna(inplace=True)

    if "salary" in df.columns:
        below_5k = ['Less than 1K', '1K to 2K', '2K to 3K', '3K to 4K', '4K to 5K']
        # '6K to 7K' was missing from the original list, which left those rows
        # with their raw label and silently turned the target into 3 classes.
        from_5k = ['5K to 6K', '6K to 7K', '7K to 8K', '8K to 9K', '9K to 10K',
                   '10K or more']
        salary_class = {label: 'B' for label in below_5k}
        salary_class.update({label: 'M' for label in from_5k})
        df["salary"] = df["salary"].replace(salary_class)

    # Cap the open-ended household-size answer at its lower bound.
    df["person_living_in_house"] = df["person_living_in_house"].replace({'10 or more': '10'})

    # Remove embedded spaces (literal match, not a regular expression).
    for spaced_column in ('kids_spending', 'house_utility'):
        df[spaced_column] = df[spaced_column].str.replace(' ', "", regex=False)

    return df

def column_transformer():
    """Build the preprocessing step: one-hot encode the categorical survey columns.

    Every column not listed below is passed through unchanged
    (``remainder='passthrough'``).

    Returns
    -------
    sklearn.compose.ColumnTransformer
        An unfitted transformer.
    """
    categorical_columns = ['race', 'gender', 'employment', 'education', 'married',
                           'house_type', 'house_value', 'vehicle', 'transport_use']

    # The encoder is fitted on a training split only, so a category can show up
    # at prediction time that was never seen during fit. The default
    # handle_unknown='error' would abort the whole prediction; 'ignore' encodes
    # such a value as all zeros for that feature instead.
    encoder = OneHotEncoder(handle_unknown='ignore')

    return make_column_transformer(
        (encoder, categorical_columns),
        remainder='passthrough',
    )

def my_pipeline():
    """Return an unfitted pipeline: column preprocessing followed by an SVC.

    The classifier uses C=1, gamma='scale' and a fixed random_state of 42.
    """
    classifier = SVC(random_state=42, C=1, gamma='scale')
    return make_pipeline(column_transformer(), classifier)

def execute_SVM(file_to_be_train, predicted_file):
    """Train the SVM pipeline on one survey file and predict labels for another.

    Parameters
    ----------
    file_to_be_train : str, path or file-like
        Labelled survey CSV; it is cleaned with ``clean_data`` and must contain
        a 'salary' column, which is used as the target.
    predicted_file : str, path or file-like
        Survey CSV to predict on; it is cleaned the same way. A 'salary'
        column, if present, is dropped before prediction.

    Returns
    -------
    array-like
        Predicted labels, one per row of ``predicted_file`` that remains after
        cleaning.
    """
    # --- training data -------------------------------------------------
    train_df = clean_data(file_to_be_train)
    X = train_df.drop(columns='salary')
    y = train_df['salary']

    # Stratify so both splits keep the class proportions of y. Only the 80 %
    # training share is used here; the hold-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = my_pipeline()
    pipe.fit(X_train, y_train)

    # --- data to predict -----------------------------------------------
    predict_df = clean_data(predicted_file)
    # errors='ignore': a file that is only being predicted on need not carry
    # the target column, so its absence must not raise KeyError here.
    X_from_pred = predict_df.drop(columns='salary', errors='ignore')

    return pipe.predict(X_from_pred)

# Train on surveyA.csv and predict the salary class for each row of the sample file.
execute_SVM("surveyA.csv", "surveyA_sample.csv")

# Create pickle model

In [None]:
# generate pickle model 
def create_pickle(file_to_be_train):
    """Train the SVM pipeline and serialise it to ``INVOKE_ML_Ammar.pkl``.

    Parameters
    ----------
    file_to_be_train : str, path or file-like
        Labelled survey CSV; it is cleaned with ``clean_data`` and must contain
        a 'salary' column, which is used as the target.

    Returns
    -------
    None
        ``pickle.dump`` has no return value; the result of this function is the
        file written to the current working directory.
    """
    train_df = clean_data(file_to_be_train)
    X = train_df.drop(columns='salary')
    y = train_df['salary']

    # Only the stratified 80 % training share is fitted; the hold-out is unused.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = my_pipeline()
    pipe.fit(X_train, y_train)

    # The context manager guarantees the file is flushed and closed even if
    # pickling fails; the original left the handle open.
    with open("INVOKE_ML_Ammar.pkl", 'wb') as model_file:
        pickle.dump(pipe, model_file)

    return None

# Fit the pipeline on surveyA.csv and write it to INVOKE_ML_Ammar.pkl.
create_pickle("surveyA.csv")

In [None]:
def pred_new_data(new_data):
    """Predict labels for a survey file with the model stored in ``INVOKE_ML_Ammar.pkl``.

    Parameters
    ----------
    new_data : str, path or file-like
        Survey CSV; it is cleaned with ``clean_data``. A 'salary' column, if
        present, is dropped before prediction.

    Returns
    -------
    array-like
        Predicted labels, one per row that remains after cleaning.
    """
    df_clean = clean_data(new_data)
    # errors='ignore': new, unlabelled data need not carry the target column.
    X = df_clean.drop(columns='salary', errors='ignore')

    # SECURITY: unpickling can execute arbitrary code. Only load a model file
    # that was produced locally by a trusted process.
    with open("INVOKE_ML_Ammar.pkl", 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    return loaded_model.predict(X)

# Predict on the sample file using the pipeline loaded from INVOKE_ML_Ammar.pkl.
pred_new_data("surveyA_sample.csv")