In [2]:
import re
from html.parser import HTMLParser

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Make sure the NLTK 'punkt' and 'stopwords' packages are available in the
# local nltk_data directory (they are downloaded only when missing).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/veraachiaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veraachiaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the training and test CSV files.
train_data = pd.read_csv("Datasets/train.csv")
test_data = pd.read_csv("Datasets/test.csv")

# Work on a copy: the preprocessing cells add many columns to `tweets`, and a
# plain alias would add them to `train_data` as well.
tweets = train_data.copy()
tweets.head()



In [None]:
# A missing keyword becomes an empty string so that joining it with the tweet
# text does not turn the combined value into NaN.
keyword_filled = tweets["keyword"].fillna("")
tweets["keyword"] = keyword_filled

# Combined text: "<keyword> <text>".
combined_text = keyword_filled + " " + tweets["text"]
tweets["tweet"] = combined_text
tweets.sample(5, random_state=42)

# Lower-cased variant used by the cleaning steps that follow.
lowered_text = tweets["tweet"].str.lower()
tweets["tweet_lower"] = lowered_text
tweets["tweet_lower"].sample(5, random_state=42)


Remove HTML

In [None]:
class _TextExtractor(HTMLParser):
    """Collect the text nodes of an HTML fragment and ignore the tags."""

    def __init__(self):
        # convert_charrefs=True makes the parser decode entities such as
        # "&gt;" or "&amp;" before the text reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)


def remove_html(text):
    """Return ``text`` with HTML tags dropped and character entities decoded.

    Parameters
    ----------
    text : str
        Raw tweet text that may contain markup or entities such as ``&gt;``.

    Returns
    -------
    str
        The text content only, with its pieces joined without a separator.
    """
    extractor = _TextExtractor()
    extractor.feed(text)
    # close() flushes any text the parser is still buffering (for example a
    # trailing "&" that might have been the start of an entity).
    extractor.close()
    return "".join(extractor.chunks)


# Quick check on one raw tweet: the leading "&gt;&gt;" should come back as ">>".
text = r"&gt;&gt; $15 Aftershock : Protect Yourself and Profit in the Next Global Financial... ##book http://t.co/f6ntUc734Z esquireattire"
remove_html(text)

tweets["tweet_noHTML"] = tweets["tweet_lower"].apply(remove_html)
tweets["tweet_noHTML"].sample(5, random_state=42)

Expand Contractions

Informal communication is full of contractions, such as can't (cannot), they've (they have), or even modern slang forms such as sux (sucks). In many cases these contractions are treated as stopwords and removed. The Python package `contractions` bundles a collection of the most common contractions and can be used to expand them as a preprocessing step.


In [None]:
!pip install contractions
import contractions

tweets["tweet_noContractions"] = tweets["tweet_noHTML"].apply(contractions.fix)
tweets["tweet_noContractions"].sample(5, random_state=42)

Remove URLs

In [None]:
def remove_urls(text):
    """Strip ``http://`` and ``https://`` links from ``text``.

    Everything from the scheme up to the next whitespace character is removed,
    so multi-level domains, nested paths, query strings and fragments go away
    together with the host name instead of leaving fragments behind.

    Parameters
    ----------
    text : str
        Text that may contain links.

    Returns
    -------
    str
        ``text`` with every link deleted; surrounding whitespace is kept.
    """
    return re.sub(r"https?://\S+", "", text)
# Strip links from the contraction-expanded text, then preview a fixed sample.
contraction_free = tweets["tweet_noContractions"]
tweets["tweet_noURLs"] = contraction_free.map(remove_urls)
tweets["tweet_noURLs"].sample(n=5, random_state=42)

Remove E-mail IDs

In [None]:
def remove_emails(text):
    """Delete every e-mail-like token (``local@domain.tld``) from ``text``."""
    email_regex = r"[\w\.-]+@[\w\.-]+\.\w+"
    return re.sub(email_regex, "", text)
# Strip e-mail addresses from the URL-free text, then preview a fixed sample.
url_free = tweets["tweet_noURLs"]
tweets["tweet_noEmail"] = url_free.map(remove_emails)
tweets["tweet_noEmail"].sample(n=5, random_state=42)

Remove tweet mentions

In [None]:
def remove_mentions(text):
    """Delete ``@handle`` mentions (an ``@`` followed by word characters)."""
    mention_regex = r"@\w+"
    return re.sub(mention_regex, "", text)
# Strip @mentions from the e-mail-free text, then preview a fixed sample.
email_free = tweets["tweet_noEmail"]
tweets["tweet_noMention"] = email_free.map(remove_mentions)
tweets["tweet_noMention"].sample(n=5, random_state=42)

Handling accents, emojis and other non-ASCII characters

In [None]:
!pip install unidecode
from unidecode import unidecode
def handle_accents(text):
    text = unidecode(text)
    return text
tweets["tweet_handleAccents"] = tweets["tweet_noMention"].apply(handle_accents)
tweets["tweet_handleAccents"].sample(5, random_state=42)
def remove_unicode_chars(text):
    """Drop every character of ``text`` that cannot be encoded as ASCII."""
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode()
# remove_unicode_chars deletes accented letters outright, so it is applied to
# the transliterated column: accents have already been mapped to plain ASCII
# there, and only characters without an ASCII form are lost.
# NOTE(review): this step previously read "tweet_noMention", which discarded
# the handle_accents result - confirm that chaining is the intended behaviour.
tweets["tweet_noUnicode"] = tweets["tweet_handleAccents"].apply(remove_unicode_chars)
tweets["tweet_noUnicode"].sample(5, random_state=42)

In [None]:
# Preview the first five rows of the working frame.
tweets.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Deeds Reason May ALLAH Forgive us,1
1,4,,,Forest fire near La Ronge Sask . Canada,1
2,5,,,residents asked 'shelter place ' notified offi...,1
3,6,,,"13,000 people receive evacuation orders Califo...",1
4,7,,,got sent photo Ruby smoke pours school,1


In [None]:
# Training, validation and prediction
# NOTE(review): the features come from the raw `text` column, not from the
# cleaned columns built by the preprocessing cells - confirm this is intended.
X = tweets['text']
y = tweets['target']

# Hold out 20% of the labelled tweets so the model can be scored on data it
# has not seen during fitting.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Use a TF-IDF vectorizer to convert text into numerical features. It is
# fitted on the training split only and reused for every other split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Logistic regression classifier
model = LogisticRegression()

# Train the model
model.fit(X_train_tfidf, y_train)

# Mean accuracy on the held-out validation split.
val_accuracy = model.score(X_val_tfidf, y_val)
print(f"Validation accuracy: {val_accuracy:.3f}")

# Vectorize the separate test file with the already-fitted vocabulary.
X_test = test_data['text']
X_test_tfidf = vectorizer.transform(X_test)

# Make predictions on the test set
predictions = model.predict(X_test_tfidf)

test_data['prediction'] = predictions

# Print the predictions and persist them next to the notebook
print(test_data[['text', 'prediction']])
test_data.to_csv('test_predictions.csv', index=False)