In [33]:
!pip install contractions
!pip install textsearch
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("nlp_train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1707,bridge%20collapse,,Ashes 2015: AustraliaÛªs collapse at Trent Br...,0
1,5789,hail,"Carol Stream, Illinois",GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...,1
2,7789,police,Houston,CNN: Tennessee movie theater shooting suspect ...,1
3,8257,rioting,,Still rioting in a couple of hours left until ...,1
4,10656,wounds,Lake Highlands,Crack in the path where I wiped out this morni...,0


In [35]:
dataset = pd.DataFrame({'Text':df['text'],'Target':df['target']})
dataset.head()

Unnamed: 0,Text,Target
0,Ashes 2015: AustraliaÛªs collapse at Trent Br...,0
1,GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...,1
2,CNN: Tennessee movie theater shooting suspect ...,1
3,Still rioting in a couple of hours left until ...,1
4,Crack in the path where I wiped out this morni...,0


In [36]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5329 entries, 0 to 5328
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    5329 non-null   object
 1   Target  5329 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 83.4+ KB


In [37]:
import contractions
import numpy as np
import re
import unicodedata


def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

def remove_http(url):
  p= re.compile('(http(s)?|www)')
  url = p.sub('',url)
  return url

def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


def pre_process_corpus(doc):
  doc = doc.translate(doc.maketrans("\n\t\r", "   "))
  doc = doc.lower()
  doc = remove_accented_chars(doc)
  doc = contractions.fix(doc)
  # lower case and remove special characters\whitespaces
  doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
  doc = re.sub(' +', ' ', doc)
  doc = doc.strip()  
  doc = remove_http(doc)
  doc = remove_stopwords(doc,is_lower_case=False)
  
  return doc

In [38]:
# build train and test datasets
text = dataset['Text']
labels = dataset['Target']


In [39]:
text_train= text.apply(pre_process_corpus)


In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_train, labels, test_size=0.3, random_state=42)

In [41]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weight_dict = dict(enumerate(class_weights))
class_weight_dict

{0: 0.8822138126773889, 1: 1.1540841584158417}

In [42]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

In [43]:
# build TFIDF features on train reviews
tfidf = TfidfVectorizer(use_idf=True, min_df=2, max_df=1.0, ngram_range=(1,2), 
                     sublinear_tf=True)


In [44]:

model_pipeline = Pipeline([('vectorizer', tfidf), 
                                ('model',  SVC())])

model_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr

In [45]:
y_pred = model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       910
           1       0.83      0.64      0.72       689

    accuracy                           0.79      1599
   macro avg       0.80      0.77      0.78      1599
weighted avg       0.79      0.79      0.78      1599

Accuracy: 78.79924953095684 %


In [46]:

from joblib import dump
dump(model_pipeline, 'text_classifier.joblib')

['text_classifier.joblib']

In [47]:
from joblib import load
sample_pipe = load('text_classifier.joblib')

In [48]:
sample_text = "air ambulance helicopter crashed, 12 people killed"

In [49]:
preprocess_text = pre_process_corpus(sample_text)

In [50]:
sample_pipe.predict([preprocess_text])[0]

1

In [51]:
def predict(text):
  preprocess_text = pre_process_corpus(text)
  idx = sample_pipe.predict([preprocess_text])[0]
  return idx

In [52]:
predict("air ambulance helicopter crashed, 12 people killed")

1

In [53]:
predict("Hello, how are you doing? how can I help you?")

0