### Import Libraries

In [3]:
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
# train_test_split lives in sklearn.model_selection; sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
# Display options: never truncate cell text, and render up to 2000 rows.
# None means "no column-width limit"; the former sentinel -1 is deprecated
# since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 2000)

In [2]:
def create_corpus(text):
    """Normalise a raw text into a list of stemmed, stop-word-free tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    dropped and each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Build the stop-word set once per call; rebuilding it inside the
    # comprehension re-read the word list for every single token.
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()
    return [ps.stem(word) for word in words if word not in english_stopwords]

In [4]:
# Fetch the NLTK 'stopwords' package, which create_corpus reads through
# stopwords.words('english'); the call reports success with True.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\antoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import data

In [4]:
# Read one CSV per topic, in the order war / politics / health / climate.
war, politics, health, climate = map(
    pd.read_csv,
    ['war_clean.csv', 'politics_clean.csv', 'health_clean.csv', 'climate_clean.csv'],
)

In [5]:
# Draw the same number of rows from every topic so the four classes are balanced.
# The draw is seeded: an unseeded sample() picks a different subset on every
# run, which makes every score computed further down irreproducible.
N_PER_TOPIC = 9000
SAMPLE_SEED = 0
war = war.sample(N_PER_TOPIC, random_state=SAMPLE_SEED)
politics = politics.sample(N_PER_TOPIC, random_state=SAMPLE_SEED)
health = health.sample(N_PER_TOPIC, random_state=SAMPLE_SEED)
climate = climate.sample(N_PER_TOPIC, random_state=SAMPLE_SEED)

In [6]:
# Stack the four topic samples into a single frame. ignore_index=True gives the
# rows a fresh 0..n-1 index; otherwise each sample keeps the labels of its
# source frame, so labels can repeat and label-based lookups become ambiguous.
df = pd.concat([war, climate, health, politics], ignore_index=True)

In [7]:
def _parse_token_list(serialized):
    """Turn the text form of a list of quoted tokens back into a Python list.

    ``"['a', 'b']"`` becomes ``['a', 'b']`` and ``"[]"`` becomes ``[]``.

    Parameters
    ----------
    serialized : str
        Bracketed, comma-separated, quoted tokens.

    Returns
    -------
    list of str
        The tokens without brackets or quote characters.
    """
    inner = serialized[1:-1]  # drop the surrounding brackets
    if not inner:
        # An empty list must stay empty; a plain split would yield [''],
        # which later shows up as a spurious empty-string feature.
        return []
    # Each piece still carries its two quote characters, hence the second slice.
    # NOTE(review): a token that itself contains ", " would be split in two.
    return [piece[1:-1] for piece in inner.split(", ")]


# The n-gram columns hold lists serialised as text (presumably by the CSV
# round-trip — confirm); rebuild real lists for every n-gram order.
for ngram_column in ('1_grams', '2_grams', '3_grams', '4_grams'):
    df[ngram_column] = df[ngram_column].apply(_parse_token_list)

In [8]:
# Flatten every row's n-gram list into one long list per n-gram order.
# Columns are addressed by position, exactly as before: 7, 4, 5 and 6.
# NOTE(review): these positions presumably correspond to the '1_grams',
# '2_grams', '3_grams' and '4_grams' columns — confirm against the CSV layout.
l_1grams = [gram for row_grams in df.iloc[:, 7] for gram in row_grams]
l_2grams = [gram for row_grams in df.iloc[:, 4] for gram in row_grams]
l_3grams = [gram for row_grams in df.iloc[:, 5] for gram in row_grams]
l_4grams = [gram for row_grams in df.iloc[:, 6] for gram in row_grams]

In [9]:
# One independent binarizer per n-gram order: each is fitted on its own column
# and returns the indicator matrix for that column.
mlabel1, mlabel2, mlabel3, mlabel4 = (MultiLabelBinarizer() for _ in range(4))
X1, X2, X3, X4 = (
    binarizer.fit_transform(df[column])
    for binarizer, column in zip(
        (mlabel1, mlabel2, mlabel3, mlabel4),
        ('1_grams', '2_grams', '3_grams', '4_grams'),
    )
)


In [10]:
# Place the four indicator matrices side by side (column-wise) to form the
# full feature matrix.
feature_blocks = [X1, X2, X3, X4]
X = np.concatenate(feature_blocks, axis=1)

In [11]:
# Target vector: the 'Category' column of df, in the same row order as the
# feature matrices built from df's n-gram columns.
y = df['Category']

### Simple classifier

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

#### Logistic regression

In [13]:
# Fit a logistic-regression classifier; only the seed is set, everything else
# is left at the library defaults. The repr recorded under this cell shows the
# defaults in effect for that run: solver='liblinear', multi_class='ovr',
# max_iter=100, C=1.0.
# NOTE(review): other scikit-learn versions may choose different defaults —
# pin them explicitly if the recorded scores must be reproduced.
LGclassifier = LogisticRegression(random_state = 0)
LGclassifier.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Score the held-out rows twice: predict() for the labels compared against
# y_test below, predict_proba() for the classifier's probability output.
y_predLR = LGclassifier.predict(X_test)
y_predLR_proba = LGclassifier.predict_proba(X_test)

##### Accuracy Logistic Regression

In [16]:
# Share of held-out rows whose predicted category equals the true one.
accLR = accuracy_score(y_test, y_predLR)  # normalize=True is the default
accLR

0.9863888888888889

##### Confusion Matrix Logistic Regression

In [17]:
# Cross-tabulate true against predicted categories on the held-out rows.
# NOTE(review): by scikit-learn's convention rows should be the true label and
# columns the predicted label, in sorted label order — confirm against the docs.
cmLR = confusion_matrix(y_test,y_predLR)
cmLR

array([[1775,    0,    5,    3],
       [   8, 1785,   11,    3],
       [  20,    2, 1788,   15],
       [  13,    0,   18, 1754]], dtype=int64)