## Before applying any text featurization, the raw text must first be converted into clean, meaningful data. This step is known as text preprocessing.

**Pre-processing steps include**
- Removing Noisy Data
- Tokenization
- Normalization

In [5]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')

import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/maria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Load the questions from 'valid.csv' (path is relative to the working directory).
dataset = pd.read_csv('valid.csv')
# Preview the first rows to check which columns were parsed.
dataset.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [30]:
# Keep only the rows whose Tags value is exactly "<python>". Rows that carry
# additional tags (e.g. "<python><pandas>") do not pass this equality test.
is_python_only = dataset["Tags"] == "<python>"

# Titles are the model input; the Y column (HQ / LQ_EDIT / LQ_CLOSE) is the target.
X_train = dataset.loc[is_python_only, "Title"]
Y_train = dataset.loc[is_python_only, "Y"]

In [31]:
# Peek at the first selected titles (the original row index is preserved).
X_train.head()

19     i am new to pythn and was trying to fix indent...
375                   Python syntax error in identation?
481                                  Program doesn't run
677    list index is out of range - cannot find why -...
691                            Opening a .ipynb.txt File
Name: Title, dtype: object

In [32]:
# Peek at the labels selected with the same row filter as the titles.
Y_train.head()

19      LQ_EDIT
375    LQ_CLOSE
481    LQ_CLOSE
677     LQ_EDIT
691          HQ
Name: Y, dtype: object

In [33]:
# Number of labelled samples left after filtering.
len(Y_train)

270

In [34]:
from nltk.tokenize import word_tokenize

In [35]:
# Flatten every title into a single list of tokens, keeping duplicates and order.
# NOTE(review): word_tokenize relies on NLTK tokenizer data, and this notebook
# only downloads 'stopwords' — confirm the tokenizer data is installed.
all_words = [
    token
    for title in X_train
    for token in word_tokenize(title)
]

In [36]:
# Total number of tokens across all titles (duplicates included).
print(len(all_words))

2888


In [37]:
# Collapse the token list to its distinct tokens. No lowercasing is applied
# here, so tokens that differ only by case are counted separately.
unique_words = set(all_words)
print(len(unique_words))

871


# Bag of Words 
(conversion of text to numerical form)

### `CountVectorizer` converts a collection of text documents into a matrix of word counts. The generated matrix is a sparse matrix.

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Vectorizer settings:
# - max_features=871 caps the vocabulary at the number of unique tokens counted
#   earlier, i.e. the cap is meant to keep every word.
# - min_df=5 ignores words that appear in fewer than 5 titles.
# - max_df=0.7 ignores words that appear in more than 70% of the titles.
# - stop_words drops the NLTK English stop words.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    stop_words=english_stop_words,
    max_df=0.7,
    min_df=5,
    max_features=871,
)

# NOTE(review): the vocabulary is learned from all titles, including the ones
# later held out for testing — consider fitting on the training split only.
# fit_transform yields a sparse count matrix; toarray() makes it a dense array.
title_counts = vectorizer.fit_transform(X_train)
X = title_counts.toarray()

In [39]:
# Inspect the dense count matrix produced by the vectorizer.
X

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Evaluation of Model

In [40]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified by label — consider stratify=Y_train.
x_train, x_test, y_train, y_test = train_test_split(X, Y_train, test_size=0.2, random_state=0)

In [41]:
# Define and train the classifier.
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees; random_state=0 makes training repeatable.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
# Fit on the training portion of the bag-of-words features.
classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [42]:
# Predict a label for every held-out sample and display the predictions.
y_pred = classifier.predict(x_test)
y_pred

array(['LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_EDIT', 'LQ_EDIT',
       'LQ_CLOSE', 'HQ', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT',
       'LQ_EDIT', 'LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_EDIT', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE', 'LQ_CLOSE',
       'LQ_CLOSE', 'LQ_CLOSE', 'LQ_EDIT'], dtype=object)

## Checking Accuracy

In [43]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted label matches the true label.
# NOTE(review): accuracy alone can hide weak per-class results — consider a
# per-class report as well.
print(accuracy_score(y_test, y_pred))

0.6296296296296297
