In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import requests
import re
from collections import Counter
import seaborn as sns
import nltk
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
# Load the labelled training set from a CSV resolved against the working directory.
df = pd.read_csv('Training_set.csv')

In [7]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,created_at,text_clean,label
0,0,2018-07-23 14:42:01,There are ways for Colorado residents to stret...,1
1,1,2018-07-23 14:20:16,RT Residents flee wildfire raging west of Athens,1
2,2,2018-07-23 14:08:16,Canada B C s Okanagan braces for more wildfires,1
3,3,2018-07-23 14:11:44,Evacuation orders as Ontario s wildfires keep ...,1
4,4,2018-07-23 08:21:09,RT How rewildling can prevent wildfires,1


In [8]:
# Inspect the column dtypes as read from the CSV.
df.dtypes

Unnamed: 0     int64
created_at    object
text_clean    object
label          int64
dtype: object

In [9]:
# Parse the `created_at` strings into pandas datetime values, replacing the
# column in place on the same frame.
df['created_at'] = df['created_at'].pipe(pd.to_datetime)

In [10]:
# Re-check the dtypes after the `created_at` conversion.
df.dtypes

Unnamed: 0             int64
created_at    datetime64[ns]
text_clean            object
label                  int64
dtype: object

In [11]:
# Index the frame by timestamp and order it chronologically.
#
# Guarded and non-inplace so the cell can be re-run safely: the previous
# `set_index('created_at', inplace=True)` raised KeyError on a second run,
# because `created_at` had already moved from the columns into the index.
if df.index.name != 'created_at':
    df = df.set_index('created_at')
df = df.sort_index()

In [12]:
p_stemmer=PorterStemmer()

# Loaded once here rather than on every call to `text_to_words`; the set is
# loop-invariant and membership tests against a frozenset are O(1).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def text_to_words(raw_text):
    """Normalise one raw document into a space-joined string of stemmed tokens.

    Processing steps, in order:
      1. Strip markup with BeautifulSoup and keep only the text content.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lowercase and split on whitespace.
      4. Drop English stop words (NLTK list).
      5. Reduce each remaining word with the Porter stemmer.

    Parameters
    ----------
    raw_text : str
        The document to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces. Empty
        string when no token survives.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    text = BeautifulSoup(raw_text, 'html.parser').get_text()
    letters_only = re.sub('[^a-zA-Z]',' ',text)
    words = letters_only.lower().split()
    # Stop words are removed before stemming, so matching is on surface forms.
    return ' '.join(p_stemmer.stem(w) for w in words if w not in ENGLISH_STOPWORDS)

In [13]:
# Run every document in the `text_clean` column through `text_to_words`,
# collecting the cleaned strings in row order.
clean_train_alltext = [text_to_words(raw_doc) for raw_doc in df['text_clean']]

In [14]:
# Bag-of-words counts feeding a logistic-regression classifier.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=250)),
])

# Vectorizer settings explored by the grid search
# (3 * 2 * 2 * 2 = 24 combinations, each fitted once per CV fold).
pipe_params = dict(
    cvec__max_features=[2500, 3000, 3500],
    cvec__min_df=[2, 3],
    cvec__max_df=[.9, .95],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

# 3-fold cross-validated search over the grid above.
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)

In [15]:
# Fit the count-vectorizer grid search: cleaned documents as features,
# the `label` column as the target.
gs.fit(clean_train_alltext, df['label'])

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [16]:
# Mean cross-validated score of the best parameter combination found by `gs`.
gs.best_score_

0.9968259450450877

In [17]:
# TF-IDF features feeding a logistic-regression classifier.
#
# `max_iter=250` matches the classifier in the CountVectorizer pipeline
# (`pipe`), so the two grid searches differ only in the vectorizer and their
# best scores are directly comparable. Previously this pipeline used the
# default iteration cap.
pipe_t=Pipeline([
    ('tf',TfidfVectorizer()),
    ('lr',LogisticRegression(max_iter=250))
])

# Same vectorizer grid as the CountVectorizer search, under the `tf__` prefix.
pipe_t_params = {
    'tf__max_features':[2500,3000,3500],
    'tf__min_df':[2,3],
    'tf__max_df':[.9,.95],
    'tf__ngram_range':[(1,1),(1,2)]
}

# 3-fold cross-validated search over the grid above.
gs_t = GridSearchCV(pipe_t,
                 param_grid=pipe_t_params,
                 cv=3)

In [19]:
# Fit the TF-IDF grid search: cleaned documents as features,
# the `label` column as the target.
gs_t.fit(clean_train_alltext, df['label'])

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        norm

In [20]:
# Mean cross-validated score of the best parameter combination found by `gs_t`.
gs_t.best_score_

0.984130067862745