In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Basic Feature Engineering

## 1.0 Concatenating Data (get foreign key data)

In [2]:
df = pd.read_csv("./csv_files/edited_csv.csv", index_col=0)
df_sample = pd.read_csv("./csv_files/sample_cleaned.csv", index_col=0)

In [3]:
products = pd.read_json('./json_files/products_table.json', lines=True)

In [4]:
questions_json = pd.read_json('./json_files/questions_table.json', lines=True)

In [5]:
questions_df = questions_json[questions_json['id'].isin(df.index)]

In [6]:
questions_df['label'] = df['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
questions_df.to_csv('./csv_files/questions_labeled_table.csv')

In [8]:
questions_df = questions_df.reset_index().drop(['index','id','is_good','status'],axis=1)

In [9]:
questions_df['description'] = questions_df.product_id.map(lambda x: ' '.join(products[products['id']==x].description.values))

In [10]:
questions_df.to_csv('./csv_files/questions_first_features.csv')

# 2.0 Words Treatment

In [71]:
questions_df = pd.read_csv('./csv_files/questions_first_features.csv', index_col=0)

In [126]:
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from string import punctuation
import string
import re
from nltk import tokenize

In [15]:
features = pd.DataFrame(index=questions_df.index)

In [16]:
features['product_id'] = questions_df['product_id']

In [17]:
features['question_len'] = questions_df.questions.apply(len)

In [99]:
def remove_punct(text):
    text = str(text)
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', ' ', text)
    
    return text.lower()

def remove_stops(text):
    clean = [word for word in text.split() if word.lower() not in stopwords.words('portuguese')]
    return ' '.join(clean)


In [73]:
questions_df.questions = questions_df.questions.str.replace(',',' ')
questions_df.answers = questions_df.answers.str.replace(',',' ')
questions_df.description = questions_df.description.str.replace(',', ' ')

In [88]:
no_punct_questions = questions_df.questions.apply(remove_punct)
no_punct_answers = questions_df.answers.apply(remove_punct)
no_punct_desc = questions_df.description.apply(remove_punct)


In [106]:
no_stop_questions = no_punct_questions.apply(remove_stops)
no_stop_answers = no_punct_answers.apply(remove_stops)
no_stop_desc = no_punct_desc.apply(remove_stops)

In [107]:
features['questions_cleaned'] = no_stop_questions
features['answers_cleaned'] = no_stop_answers
features['desc_cleaned'] = no_stop_desc

In [113]:
features['questions_cleaned_len'] = features.questions_cleaned.apply(len)

In [120]:
features.to_csv('./csv_files/featuresDF_cleaned.csv')

# 2.1 - Vectorize strings field

In [210]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import scipy.sparse

In [123]:
features_vec = pd.DataFrame(index=features.index)

In [128]:
target = questions_df.label

In [129]:
Xtrain, Xval, ytrain, yval = train_test_split(features,target,random_state=0, test_size=0.4)

In [134]:
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((550, 6), (368, 6), (550,), (368,))

In [150]:
strings_train = Xtrain[['questions_cleaned', 'answers_cleaned', 'desc_cleaned']]
strings_val = Xval[['questions_cleaned', 'answers_cleaned', 'desc_cleaned']]

In [260]:
vectorizer = TfidfVectorizer(min_df=2)

questions_bow_train = vectorizer.fit_transform(strings_train.questions_cleaned)
questions_bow_val = vectorizer.transform(strings_val.questions_cleaned)

answers_bow_train = vectorizer.fit_transform(strings_train.answers_cleaned)
answers_bow_val = vectorizer.transform(strings_val.answers_cleaned)

desc_bow_train = vectorizer.fit_transform(strings_train.desc_cleaned)
desc_bow_val = vectorizer.transform(strings_val.desc_cleaned)



In [266]:
Xtrain_noStrings = Xtrain.drop(strings_train,axis=1)
Xval_noStrings = Xval.drop(strings_val, axis=1)

In [267]:
Xtrain_wBows = hstack([Xtrain_noStrings, questions_bow_train, answers_bow_train, desc_bow_train])
Xval_wBows = hstack([Xval_noStrings, questions_bow_val, answers_bow_val, desc_bow_val])

In [268]:
Xtrain_wBows, Xval_wBows

(<550x3522 sparse matrix of type '<class 'numpy.float64'>'
 	with 43496 stored elements in COOrdinate format>,
 <368x3522 sparse matrix of type '<class 'numpy.float64'>'
 	with 27127 stored elements in COOrdinate format>)

In [269]:
scipy.sparse.save_npz('./npz/features_bow_train.npz', Xtrain_wBows)
scipy.sparse.save_npz('./npz/features_bow_val.npz', Xval_wBows)

In [282]:
Xtrain_qBow = hstack([Xtrain_noStrings, questions_bow_train])
Xval_qBow = hstack([Xval_noStrings, questions_bow_val])

In [289]:
scipy.sparse.save_npz('./npz/features_questions_train.npz', Xtrain_qBow)
scipy.sparse.save_npz('./npz/features_questions_vak.npz', Xval_qBow)

# 3.0 - Model 01

In [270]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [283]:
mdl = RandomForestClassifier(n_estimators=1000,random_state=0, class_weight='balanced')
mdl.fit(Xtrain_qBow,ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [285]:
p = mdl.predict(Xval_qBow)
proba = mdl.predict_proba(Xval_qBow)[:,0]

In [276]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

In [286]:
print(classification_report(yval,p))

              precision    recall  f1-score   support

         0.0       0.84      0.64      0.73       176
         1.0       0.73      0.89      0.80       192

    accuracy                           0.77       368
   macro avg       0.78      0.76      0.76       368
weighted avg       0.78      0.77      0.76       368



In [287]:
roc_auc_score(yval,proba), average_precision_score(yval,proba)

(0.15789240056818182, 0.3518864709453402)

In [288]:
precision_score(yval,p), recall_score(yval,p)

(0.7296137339055794, 0.8854166666666666)

# 3.1 - KFold

In [None]:
from sklearn.model_selection import KFold

In [None]:
kf = KFold(n_splits=3, shuffle=True, random_state=0)
X=df_edited.questions
y=df_edited.label