In [6]:
from xgboost import XGBClassifier
import pandas as pd
import torch
from sklearn.model_selection import KFold,cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer,accuracy_score, precision_score, recall_score, f1_score
import os
import numpy as np
import tuning_script

In [7]:
def read_data(filename,samp_size, max_rows=5000, random_state=None):
    """Load a CSV from the sibling ``model_data`` directory and return a random sample.

    The file is looked up at ``<parent of cwd>/model_data/<filename>``.

    Args:
        filename: Name of the CSV file inside ``model_data``.
        samp_size: Fraction of rows to keep. Values above 1 are clamped to 1;
            values of 0 or below fall back to 0.1.
        max_rows: Upper bound on the number of sampled rows (default 5000,
            the cap this function has always applied).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced. ``None`` keeps the sampling unseeded.

    Returns:
        pandas.DataFrame: The sampled rows with a fresh 0..n-1 index.
    """
    path = os.path.join(os.path.dirname(os.getcwd()), "model_data", filename)

    # Clamp the requested fraction into a usable range.
    if samp_size > 1:
        samp_size = 1
    if samp_size <= 0:
        samp_size = .1

    df = pd.read_csv(path)

    # Take the requested fraction, but never more than max_rows rows.
    n_rows = min(int(len(df) * samp_size), max_rows)
    df = df.sample(n=n_rows, random_state=random_state).reset_index(drop=True)

    return df

def encode_Test_Lables(label):
    """Shift a label down by one and return it as a built-in ``int``.

    Applied to the ``Rating`` column elsewhere in this notebook; presumably the
    ratings start at 1 and the classifier needs classes starting at 0 — confirm
    against the data set.
    """
    zero_based = label - 1
    return int(zero_based)

In [8]:
def runModel(filename,samp_size,estimators,lr,l2):
    """Cross-validate a linear XGBoost classifier on TF-IDF features of review text.

    Loads a sample of ``filename`` through ``read_data``, vectorizes the
    ``Review_text`` column with TF-IDF, encodes ``Rating`` with
    ``encode_Test_Lables`` and prints the mean 5-fold accuracy, precision,
    recall and F1 score.

    Args:
        filename: CSV file name, forwarded to ``read_data``.
        samp_size: Sampling fraction, forwarded to ``read_data``.
        estimators: Number of boosting rounds (``n_estimators``).
        lr: Learning rate (``learning_rate``).
        l2: L2 regularization strength (``reg_lambda``).

    Returns:
        None. The averaged scores are printed.
    """
    df = read_data(filename,samp_size)

    # Use the GPU when torch reports one, otherwise stay on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Convert text data into numerical features using TF-IDF
    vectorizer = TfidfVectorizer(max_df=.95,min_df=.0125)
    x = vectorizer.fit_transform(df["Review_text"])
    y = df["Rating"].apply(encode_Test_Lables)

    # Create the model instance. The keyword names must be spelled exactly
    # (learning_rate / reg_lambda): XGBoost does not reject unknown keywords,
    # so a misspelling silently leaves the hyperparameter at its default.
    bst = XGBClassifier(
        n_estimators=estimators,
        learning_rate=lr,
        reg_lambda=l2,
        booster='gblinear',
        objective='multi:softmax',
        device=device,
    )

    # Set folds and scoring.
    # NOTE(review): with micro averaging on single-label multiclass data,
    # precision, recall and F1 all equal accuracy; consider "macro" or
    # "weighted" if per-class behaviour matters.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score,average="micro",zero_division=0.0),
        'recall': make_scorer(recall_score,average="micro",zero_division=0.0),
        'f1_score': make_scorer(f1_score,average="micro",zero_division=0.0),
    }

    # Run the cross-validation.
    cv_results = cross_validate(bst, x, y, cv=kf, scoring=scoring)

    # Print results; the argument order must match the labels in the template.
    print("Model Accuracy: {0:.2%} Precision: {1:.4f} Recall: {2:.4f} F1 Score: {3:.4f}".format(
        np.mean(cv_results['test_accuracy']),
        np.mean(cv_results['test_precision']),
        np.mean(cv_results['test_recall']),
        np.mean(cv_results['test_f1_score']),
    ))


In [9]:
def run_Experiment(filename,samp_size,estimators,lr,l2,max_df,min_df):
    """Grid-search XGBoost hyperparameters on TF-IDF features of review text.

    Args:
        filename: CSV file name, forwarded to ``read_data``.
        samp_size: Sampling fraction, forwarded to ``read_data``.
        estimators: Candidate values for ``n_estimators``.
        lr: Candidate values for ``learning_rate``.
        l2: Candidate values for ``reg_lambda``.
        max_df: ``max_df`` setting for the TF-IDF vectorizer.
        min_df: ``min_df`` setting for the TF-IDF vectorizer.

    Returns:
        None. The search is delegated to ``tuning_script.grid_search``, which
        presumably reports the best parameters itself — verify in that module.
    """
    reviews = read_data(filename, samp_size)

    # Pick the GPU when torch can see one.
    device = 'cuda' if torch.cuda.is_available() else "cpu"

    # Turn the raw review text into a TF-IDF feature matrix.
    tfidf = TfidfVectorizer(max_df=max_df, min_df=min_df)
    features = tfidf.fit_transform(reviews["Review_text"])
    labels = reviews["Rating"].apply(encode_Test_Lables)

    # Base estimator; the three tuned settings below start from these values.
    base_model = XGBClassifier(
        n_estimators=5,
        reg_lambda=.01,
        learning_rate=.003,
        booster='gblinear',
        objective='multi:softmax',
        device=device,
    )
    search_space = {
        "n_estimators": estimators,
        "learning_rate": lr,
        "reg_lambda": l2,
    }

    tuning_script.grid_search(base_model, search_space, features, labels)

In [None]:
# Hyperparameter candidates handed to the grid search.
lr = [0.0001, 0.0005, 0.001]
estimators = [5, 10, 15, 20]
l2 = [0.552, 0.652, 0.753, 0.852]

# TF-IDF document-frequency settings.
max_df = 0.97
min_df = 0.001

run_Experiment(
    'data_set_2.csv',
    0.5,
    estimators,
    lr,
    l2,
    max_df,
    min_df,
)

Best Hyper paramaters:  {'learning_rate': 0.001, 'n_estimators': 5, 'reg_lambda': 0.852}
Best Scores: 76.60%
