In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from nltk import word_tokenize 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
traindir = '/content/drive/MyDrive/train.csv'
testdir = '/content/drive/MyDrive/test.csv'

In [None]:
readTrain = pd.read_csv(traindir)

In [None]:
readTrain

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
# pre-processing data
import re 


def cleanPunc(sentence):
    """Lower-case a sentence and reduce it to space-separated alphanumeric tokens.

    Steps:
      1. ``? ! ' " #`` are deleted outright, so contractions and hashtags stay
         a single token ("can't" -> "cant", "#fire" -> "fire").
      2. Every other non-alphanumeric character (punctuation, symbols,
         newlines, non-ASCII characters) is treated as a word separator.
      3. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        sentence: Raw text. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as empty text.

    Returns:
        The cleaned, lower-cased string ('' when nothing alphanumeric remains).
    """
    if not isinstance(sentence, str):
        return ''

    lowered = sentence.lower()
    # No '|' inside the class: within [...] a pipe is a literal character, not
    # alternation, so listing it would delete pipes and glue words together.
    joined = re.sub(r'[?!\'"#]', '', lowered)
    # After lower() only a-z and 0-9 are kept; everything else is a separator.
    spaced = re.sub(r'[^a-z0-9]', ' ', joined)
    # split()/join collapses repeated separators and trims both ends.
    return ' '.join(spaced.split())


In [None]:
readTrain['cleaned-text'] = readTrain['text'].apply(cleanPunc)

In [None]:
readTrain['text'].iloc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [None]:
readTrain['cleaned-text'].iloc[0]

'our deeds are the reason of this earthquake may allah forgive us all'

In [None]:
# Hand-picked stop words; remove_stopwords() drops any token found in this list.
stopwords = [
    'as', 'in', 'of', 'is',
    'are', 'were', 'was', 'it',
]


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

def remove_stopwords(sentence):
  """Tokenize the sentence and drop every token listed in ``stopwords``.

  The surviving tokens are returned as one string joined by single spaces.
  """
  kept_tokens = []
  for token in word_tokenize(sentence):
    if token in stopwords:
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)

In [None]:
readTrain['cleaned-text2'] = readTrain['cleaned-text'].apply(remove_stopwords)

In [None]:
readTrain['cleaned-text2'].iloc[0]

'our deeds the reason this earthquake may allah forgive us all'

In [None]:
df = readTrain[['cleaned-text2', 'target']]

In [None]:
df

Unnamed: 0,cleaned-text2,target
0,our deeds the reason this earthquake may allah...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter place being not...,1
3,13 000 people receive wildfires evacuation ord...,1
4,just got sent this photo from ruby alaska smok...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,aria ahrary thetawniest the out control wild f...,1
7610,m1 94 01 04 utc 5km s volcano hawaii http t co...,1
7611,police investigating after an e bike collided ...,1


In [None]:
def remove_numbers(sentence):
  """Strip every ASCII digit (0-9) from the sentence.

  Only the digit characters are deleted; surrounding whitespace is left
  untouched, so gaps may remain where numbers used to be.
  """
  digit_pattern = re.compile(r'[0-9]')
  return digit_pattern.sub('', sentence)

In [None]:
readTrain['cleaned-text2'] = readTrain['cleaned-text2'].apply(remove_numbers)

In [None]:
readTrain['cleaned-text2'].iloc[3]

'  people receive wildfires evacuation orders california'

In [None]:
X = readTrain['cleaned-text2']
y = readTrain['target']

In [None]:
#train-test-split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10 )

In [None]:
X_test

6524    ddnewslive nitishkumar and arvindkejriwal cant...
701                         acousticmaloley no he blazing
3119    do babies actually get electrocuted from wall ...
4204    precious cargo onesie recalled for choking haz...
1651    my portable closet has collapsed x and finally...
                              ...                        
1985    i liked a youtube video http t co tbxcakdrw gt...
6291    doves the storm greatest denier electric proms...
850     chief cg nah young blood that cook gone im cut...
6126    i just made a weird high pitched noise and the...
5782    if that would been a black dude antioch would ...
Name: cleaned-text2, Length: 1904, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_vect, y_train)


In [None]:
y_pred = clf.predict(X_test_vect)

In [None]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7830882352941176

<5709x17228 sparse matrix of type '<class 'numpy.float64'>'
	with 77742 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[1000,   83],
       [ 330,  491]])

In [None]:
# Build the vocabulary here so this cell does not rely on a later cell having
# been run first (it would raise NameError on a fresh kernel otherwise).
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['aa', 'aaaa', 'aaaaaaallll', ..., 'zztbvjypn', 'zzudilrea', 'zzzz'],
      dtype=object)

In [None]:
from matplotlib import pyplot as plt
def f_importances(coef, names, top=-1):
    """Draw a horizontal bar chart of the highest-valued features.

    Args:
        coef: Sequence of numeric scores, one per feature.
        names: Sequence of feature names aligned with ``coef``.
        top: How many of the highest-scoring features to plot. Any negative
            value (default -1) or a value larger than the number of features
            plots all of them.

    Returns:
        None. The figure is rendered with ``plt.show()``; nothing is drawn
        when there are no features.
    """
    # Sort (score, name) pairs ascending; ties fall back to comparing names.
    ranked = sorted(zip(coef, names))
    if not ranked:
        # Nothing to plot; unpacking zip(*[]) below would raise ValueError.
        return

    imp, names = zip(*ranked)

    # Clamp `top` so the tick positions and the sliced data always have the
    # same length; otherwise barh/yticks fail with a shape mismatch.
    if top < 0 or top > len(names):
        top = len(names)

    plt.figure(figsize=(15,8))
    # Reverse so the largest scores come first before taking the first `top`.
    plt.barh(range(top), imp[::-1][0:top], align='center')
    plt.yticks(range(top), names[::-1][0:top])
    plt.show()

In [None]:
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Pair every vocabulary term with the matching entry from row 0 of
# clf.feature_log_prob_ and sort the pairs ascending by value.
# NOTE(review): per the scikit-learn docs these are log P(feature | class) for
# the first class in clf.classes_, not linear-model coefficients, despite the
# 'coefficient' column name — confirm the intended interpretation.
coefs_with_fns = sorted(zip(clf.feature_log_prob_[0], feature_names)) 
# NOTE: this rebinds `df`, which an earlier cell assigned to a slice of readTrain.
df=pd.DataFrame(coefs_with_fns)
df.columns='coefficient','word'
# Displayed sorted by value; the result is not assigned, so `df` itself is unchanged.
df.sort_values(by='coefficient')

Unnamed: 0,coefficient,word
0,-10.246826,aaaaaaallll
3901,-10.246826,profittothepeople
3900,-10.246826,professionally
3899,-10.246826,produces
3898,-10.246826,process
...,...,...
17223,-5.885825,you
17224,-5.731339,to
17225,-5.617344,http
17226,-5.528622,co


In [None]:
#Steps for NLP

1. Text Cleaning
  1.1 Text Standardization
  1.2 Eliminating unnecessary words
  1.3 converting non-words to words
  1.4 convert negative words to antonyms
  1.5 stemming & lemmatization

  1.6 results 


2. EDA/Feature Engineering 

3. Text to numerical conversion 
  3.1 pre-conversion 
  3.2 conversion (TF-IDF)
  3.3 post-conversion dimensionality reduction

4. ML-modelling


 



In [None]:
!pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [None]:
from spellchecker import SpellChecker

def spellcheck(sentence, spell=None):
  """Replace misspelled words in a sentence with their most likely correction.

  Args:
    sentence: Whitespace-separated text to check.
    spell: Optional checker object exposing ``unknown(words)`` and
      ``correction(word)``. When omitted, one ``SpellChecker()`` is created on
      first use and reused by later calls instead of being rebuilt per call.

  Returns:
    The words joined by single spaces. Words the checker flags but cannot
    correct are kept unchanged.
  """
  if spell is None:
    # Constructing SpellChecker is costly, so cache one instance on the
    # function object and share it across calls (e.g. under DataFrame.apply).
    spell = getattr(spellcheck, '_shared_checker', None)
    if spell is None:
      spell = spellcheck._shared_checker = SpellChecker()

  words = sentence.split()
  misspelled_words = spell.unknown(words)
  # NOTE(review): pyspellchecker appears to lower-case words inside unknown()
  # by default, so capitalized typos may not match here — confirm if
  # mixed-case input matters.

  corrected_words = []
  for word in words:
    if word in misspelled_words:
      # correction() may return None when it has no candidate; keep the
      # original word so the join below never sees a non-string.
      corrected_words.append(spell.correction(word) or word)
    else:
      corrected_words.append(word)

  return ' '.join(corrected_words)



In [None]:
text = 'speling corection'

In [None]:
text = spellcheck(text)

In [None]:
text

'spelling correction'

In [None]:
from nltk.stem.porter import PorterStemmer

text = 'i wanted wanting'

In [None]:
stemmer = PorterStemmer()
def stem(sentence):
  """Stem each whitespace-separated word with the module-level ``stemmer``.

  The stemmed words are returned joined by single spaces.
  """
  stemmed_tokens = map(stemmer.stem, sentence.split())
  return ' '.join(stemmed_tokens)

In [None]:
text = stem(text)

In [None]:
text

'i want want'

In [None]:
re.sub(' +', ' ', text)

In [None]:
c = 'a b c'

In [None]:
c.split()

['a', 'b', 'c']