In [1]:
import sys
import argparse
import logging
import io
import os
import re
import pandas as pd
import math
import random
import numpy as np
from collections import Counter
import json
import requests
import base64 
import Levenshtein as lev
import yaml
import unicodedata
from datetime import datetime, timedelta
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Never truncate cell contents when displaying frames. `None` is the supported
# "no limit" value; the old `-1` sentinel is deprecated since pandas 1.0 and is
# no longer accepted by newer releases.
pd.set_option('display.max_colwidth', None)
from ast import literal_eval



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from joblib import dump, load

In [3]:
# Load the raw table and build a binary dataset from it:
# rows with col == 1 become class 0, rows with col == 2 become class 1.
df = pd.read_csv('data/1.csv', encoding='utf-8')
df1, df2 = df[df['col']==1], df[df['col']==2]
# Each subset is labelled from its own frame; labelling df1 twice would drop
# every col == 2 row and duplicate the col == 1 rows under both labels.
df1, df2 = df1.assign(label=0), df2.assign(label=1)
# Shuffle with a fixed seed so a fresh "Run All" reproduces the same row order,
# then renumber the index.
data = pd.concat([df1, df2]).sample(frac=1, random_state=42).reset_index(drop=True)

In [31]:
# Two-stage hold-out split (both stages seeded with 42):
#   1. 30% of all rows are set aside as the test set;
#   2. 30% of what remains is set aside as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.3, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)
# Sanity check: features and labels of each split must have matching lengths.
print(*map(len, (X_train, y_train, X_val, y_val, X_test, y_test)))

112016 112016 48008 48008 68583 68583


In [32]:
# Turn raw text into TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram-only vocabulary, capped at 100,000 features.
vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=100000)

# The vocabulary is fitted on the training split only; validation and test
# texts are merely projected onto it.
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [11]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,auc,roc_auc_score
# import scikitplot as skplt
import matplotlib.pyplot as plt

# Helper that fits a model and prints its performance metrics.
def model_train_eval(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print evaluation metrics.

    Prints the model, its accuracy, confusion matrix and classification
    report (restricted to labels 0 and 1) on ``(X_test, y_test)``. When the
    model exposes ``predict_proba``, the ROC AUC of the class-1 probability
    is printed as well.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``predict``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data the metrics are computed on.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    print(model)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("prediciton Accuracy : %f" % accuracy_score(y_test, pred))
    print("Confusion_matrix : ")
    print(confusion_matrix(y_test, pred))
    print("classification report : ")
    print(classification_report(y_test, pred, labels=[0, 1]))

    # Ask the estimator whether it can produce probabilities instead of
    # matching on its printed name: any model without predict_proba is
    # skipped here rather than raising AttributeError after training.
    if hasattr(model, "predict_proba"):
        pred_proba_c1 = model.predict_proba(X_test)[:, 1]
        print("AUC Score : %f" % roc_auc_score(y_test, pred_proba_c1))
    return model

def model_test_eval(y_test, pred, pred_proba=None):
    """Print evaluation metrics for predictions that were already computed.

    Prints accuracy, confusion matrix and classification report (restricted
    to labels 0 and 1). When ``pred_proba`` is given and ``y_test`` is
    neither all zeros nor all ones, the ROC AUC is printed as well.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels.
    pred_proba : optional scores for the AUC. Either one score per sample,
        or a 2-D matrix (anything exposing ``ndim == 2``), in which case the
        last column is used as the score.
    """
    print("prediciton Accuracy : %f" % accuracy_score(y_test, pred))
    print("Confusion_matrix : ")
    print(confusion_matrix(y_test, pred))
    print("classification report : ")
    print(classification_report(y_test, pred, labels=[0, 1]))

    # roc_auc_score needs one score per sample for binary targets; a raw
    # predict_proba matrix is reduced to its last column first.
    if pred_proba is not None and getattr(pred_proba, "ndim", 1) == 2:
        pred_proba = pred_proba[:, -1]

    # Skip the AUC when only one class is present in y_test.
    if pred_proba is not None and sum(y_test) not in (len(y_test), 0):
        print("AUC Score : %f" % roc_auc_score(y_test, pred_proba))

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV

# Candidate classifiers.
NB = MultinomialNB()
# L1-penalised logistic regression with 5-fold cross-validation and balanced
# class weights. The solver is spelled "liblinear"; a misspelled solver name
# is rejected when the model is fitted.
LRcv = LogisticRegressionCV(solver="liblinear", penalty="l1", cv=5, random_state=42, verbose=2,
                            max_iter=5, class_weight='balanced')

In [None]:
# Fit the cross-validated logistic regression on the training split and
# print its metrics on the validation split.
model = model_train_eval(
    model=LRcv,
    X_train=X_train,
    y_train=y_train,
    X_test=X_val,
    y_test=y_val,
)

In [34]:
# Final evaluation on the held-out test split.
pred = model.predict(X_test)
# predict_proba returns one column per class; the AUC needs a single score per
# sample, so keep only the class-1 column.
pred_proba = model.predict_proba(X_test)[:, 1]
model_test_eval(y_test, pred, pred_proba)

[[56079   137]
 [ 1051 11316]]


In [15]:
# Persist the fitted TF-IDF vectorizer.
dump(value=vectorizer, filename='model/vectorizer_v1_2gram.pkl')

['model/vectorizer_v1_2gram.pkl']

In [16]:
# Persist the fitted classifier.
dump(value=model, filename='model/lr_v1_2gram.joblib')

['model/lr_v1_2gram.joblib']