In [None]:
pip install tidytext textblob

In [None]:
import nltk

## Fetch the NLTK resources used by this notebook.
## 'stopwords' backs stopwords.words('english') further down; 'punkt' is
## presumably for word_tokenize, which is imported but not called in the
## visible cells -- TODO confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 200)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

from Help_Funs import count_chars, count_words, count_capital_chars, count_capital_words, count_sent, count_unique_words, count_stopwords, count_hashtags 

## Connecting to the S3 bucket that holds the competition files.
## (The resource handle is created once; the original created it twice.)
s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

## Defining files names
file_key_1 = 'NLP-Disaster-Tweets/train.csv'
file_key_2 = 'NLP-Disaster-Tweets/test.csv'
file_key_3 = 'NLP-Disaster-Tweets/sample_submission.csv'

## For each key: build the object handle, issue the GET request and take the
## 'Body' entry of the response, which is passed to pandas below.
bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
sample = pd.read_csv(file_content_stream_3)

In [None]:
## Stop-word vocabulary: NLTK's English list plus a few notebook-specific extras.
## NOTE(review): clean_tweet strips apostrophes and every non [a-z0-9] character
## before it filters tokens, so the entries "it'll", "=", '+', "'s'" and '"'
## can never match a cleaned token.
others = {"1", "2", "it'll", "ill", "=", '+', "'s'", '"'}
stop_words = set(stopwords.words('english')) | others

def clean_tweet(tweet):
    """Normalise a raw tweet into a lowercase, space-separated token string.

    The steps run in this order: lowercase the text; delete apostrophes;
    delete @mentions, #hashtags and URLs; blank out ``( ) ! ?`` and any
    square-bracketed span; turn every remaining character outside ``a-z0-9``
    into a space; split on whitespace; drop tokens found in the module-level
    ``stop_words`` set.

    Parameters
    ----------
    tweet : str or missing value
        Raw tweet text. Anything that is not a ``str`` (NaN, None, ...) is
        treated as a missing tweet.

    Returns
    -------
    str
        The cleaned tweet, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # The original test was `type(tweet) == np.float`; that alias was removed
    # in NumPy 1.24, so the check raised AttributeError on every call.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    # Apostrophes are deleted (not replaced by a space) so that a contraction
    # such as "don't" stays one token ("dont") instead of being split.
    temp = re.sub("'", "", temp)
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)   # @mentions
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)   # #hashtags
    temp = re.sub(r'http\S+', '', temp)         # URLs
    temp = re.sub('[()!?]', ' ', temp)
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    temp = re.sub(r'\[.*?\]', ' ', temp)        # [bracketed] spans, content included
    temp = re.sub("[^a-z0-9]", " ", temp)
    tokens = [w for w in temp.split() if w not in stop_words]
    return " ".join(tokens)

In [None]:
## Preview the first rows of the training frame as loaded.
train.head()

In [None]:
## Preview the first rows of the test frame as loaded.
test.head()

In [None]:
## Cleaning every training tweet (tqdm keeps the progress bar).
## The column is built with a single assignment: the previous element-wise
## `train['clean_tweet'][i] = ...` was chained indexing, which is a silent no-op
## under pandas Copy-on-Write, wrote strings into a float (NaN) column, and
## relied on the frame having a default 0..n-1 index.
train['clean_tweet'] = [clean_tweet(text) for text in tqdm(train['text'])]

In [None]:
## Cleaning every test tweet (tqdm keeps the progress bar).
## The column is built with a single assignment: the previous element-wise
## `test['clean_tweet'][i] = ...` was chained indexing, which is a silent no-op
## under pandas Copy-on-Write, wrote strings into a float (NaN) column, and
## relied on the frame having a default 0..n-1 index.
test['clean_tweet'] = [clean_tweet(text) for text in tqdm(test['text'])]

In [None]:
## TextBlob polarity ('sentiment') and subjectivity of each cleaned tweet.
## Every tweet is analysed once and both scores are read from the same result;
## previously a fresh TextBlob was built and analysed twice per tweet.
for frame in (train, test):
    scores = [TextBlob(text).sentiment for text in frame['clean_tweet']]
    frame['sentiment'] = [score.polarity for score in scores]
    frame['subjectivity'] = [score.subjectivity for score in scores]

In [None]:
## Bucketing the polarity score into three labels:
## below zero -> 'negative', exactly zero -> 'neutral', anything else -> 'positive'.
for frame in (train, test):
    polarity = frame['sentiment']
    # Label to use when the score is not negative.
    zero_or_above = np.where(polarity == 0, 'neutral', 'positive')
    frame['sentiment_label'] = np.where(polarity < 0, 'negative', zero_or_above)

In [None]:
## One-hot encoding the sentiment label. The raw polarity score and the label
## column are removed, and the indicator columns are appended in their place.
## NOTE(review): this cell rebinds train/test, so it cannot be re-run without
## re-running the cells that create 'sentiment' and 'sentiment_label'.
train_dummies = pd.get_dummies(train['sentiment_label'])
train = pd.concat(
    [train.drop(['sentiment', 'sentiment_label'], axis = 1), train_dummies],
    axis = 'columns',
)

test_dummies = pd.get_dummies(test['sentiment_label'])
test = pd.concat(
    [test.drop(['sentiment', 'sentiment_label'], axis = 1), test_dummies],
    axis = 'columns',
)

In [None]:
## Size features computed on the cleaned tweet, using the Help_Funs counters,
## followed by two ratios derived from them.
## NOTE(review): a cleaned tweet can be the empty string; presumably count_words
## returns 0 for it, which would make both ratios NaN/inf -- verify downstream.
for frame in (train, test):
    cleaned = frame['clean_tweet']
    frame['char_count'] = cleaned.apply(count_chars)
    frame['word_count'] = cleaned.apply(count_words)
    frame['unique_word_count'] = cleaned.apply(count_unique_words)

    ## Average word length
    frame['avg_wordlength'] = frame['char_count'].div(frame['word_count'])

    ## Unique words vs count words
    frame['unique_vs_words'] = frame['unique_word_count'].div(frame['word_count'])

In [None]:
## Writing the engineered frames to CSV files in the working directory,
## without the index column.
for frame, out_path in ((train, 'train.csv'), (test, 'test.csv')):
    frame.to_csv(out_path, index = False)

In [None]:
## Share of training rows falling in each target class.
train['target'].value_counts().div(train.shape[0])

In [None]:
## Target class vs. sentiment label.
## 'sentiment_label' is no longer a column of `train` (it was dropped when the
## label was one-hot encoded), so indexing it raised KeyError. The label is
## rebuilt here from the indicator columns: for each row, the name of the
## column holding the 1/True.
pd.crosstab(train['target'], train_dummies.idxmax(axis = 1).rename('sentiment_label'))