<a href="https://colab.research.google.com/github/Bogdan-Strat/NLP-with-disaster-tweets/blob/main/NLP_with_disaster_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
! pip install num2words
! pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234911 sha256=384df0fb3f7e7f39da2de26851a9f976be42648ffa1f014714d6a0efdaeccfc8
  Stored in directory: /root/.cache/pip/wheels/02/3d/88/51a592b9ad17e7899126563698b4e3961983ebe85747228ba6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0


In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from num2words import num2words
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import emoji
import string

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

In [33]:
# Preprocessing function
def lower_text(tweet):
    return tweet.lower()

def convert_number_to_words(tweet):
  tweet_num2words = []
  data = tweet.split()

  for text in data:
    if ',' in text:
            parts = text.split(',')
            converted_parts = []
            for part in parts:
                if part.isdigit():
                    number_without_comma = int(part)
                    converted_parts.append(num2words(number_without_comma))
                else:
                    converted_parts.append(part)
            converted_word = ' '.join(converted_parts)
            tweet_num2words.append(converted_word)
    elif text.isdigit():
      tweet_num2words.append(num2words(text))
    else:
      tweet_num2words.append(text)
  return ' '.join(tweet_num2words)

def remove_links(tweet):
  return ' '.join([re.sub(r'http\S+', '', word) for word in tweet.split(" ")])
 
def remove_emoticons_and_emojis(tweet):
  emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
  tweet = ' '.join([re.sub(emoticon_re, '', word) for word in tweet.split(" ")])
  return emoji.replace_emoji(tweet, replace='')
 
def remove_hashtags_and_mentions(tweet):
  tweets_no_hashtags = [re.sub(r'#[a-zA-Z0-9_]+','', word) for word in tweet]
  return ' '.join([re.sub(r'@[a-zA-Z0-9_]+','', word) for word in tweets_no_hashtags])
 
def remove_multiple_spaces(tweet):
  return  ' '.join([re.sub(r'\s+', ' ', word).strip() for word in tweet.split(" ")])
 
'''
Aveti grija la cazurile de tipul "unu,doi". Daca eliminati punctuatia direct, cele doua cuvinte vor 
fi concatenate obtinand un singur cuvant "unudoi". O alternativa ar fi sa inlocuim 
mai intai toate caracterele de punctuatie cu spatiu, apoi sa aplicam inca o data metoda de contractie a spatiilor.
'''
 
def remove_punctuation(tweet):
  tweets_no_punct = [re.sub(r'[^\w\s]', ' ', word) for word in tweet.split(" ")]
  return ' '.join([re.sub(r'\s+', ' ', tweet) for tweet in tweets_no_punct])

def preprocess(tweet):
  tweet = lower_text(tweet)
  tweet = convert_number_to_words(tweet)
  tweet = remove_links(tweet)
  tweet = remove_emoticons_and_emojis(tweet)
  tweet = remove_multiple_spaces(tweet)
  tweet = remove_punctuation(tweet)
 
  return tweet

In [34]:
def remove_stopwords(tweet):
  stop_words_nltk = set(stopwords.words('english'))
  all_words = [word for word in tweet]
  all_words_without_stops = [word for word in all_words if word not in stop_words_nltk]
  return all_words_without_stops

def lematizer(tweet):
  words_lemmatize = []
  stemmer = SnowballStemmer(language='english')
  for token in tweet:
    words_lemmatize.append(stemmer.stem(token))
  return words_lemmatize

def tokenizer(tweet):
  tweet = word_tokenize(tweet)
  tweet = remove_stopwords(tweet)
  tweet = lematizer(tweet)
  return tweet

In [36]:
# Load the data
data = pd.read_csv("train.csv")  # Assuming you have a CSV file with columns id, body, and label
data

# Split the data into features (email body) and labels (spam or not spam)
X = data['text']
y = data['target']
tokenizer(X[3])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the email bodies with lowercase preprocessing
vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenizer)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Predict on the test set
predictions = nb_classifier.predict(X_test_vectorized)

# Evaluate the classifier
accuracy = (predictions == y_test).mean()
print(f"Accuracy: {accuracy}")



Accuracy: 0.7918581746552856


bin	 dev   lib32   mnt			 python-apt  srv    usr
boot	 etc   lib64   NGC-DL-CONTAINER-LICENSE  root	     sys    var
content  home  libx32  opt			 run	     tmp
datalab  lib   media   proc			 sbin	     tools
