# Cleaning some data
## Getting our hands dirty!! :)

In [None]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import string
import warnings
from pandas.core.common import SettingWithCopyWarning

### Collect some data

In [None]:
# Load the labelled training tweets; the "tweetid" column becomes the row index.
data = pd.read_csv("train.csv", index_col = "tweetid")
# Preview the first rows to check that the file was parsed as expected.
data.head()
# The unlabelled test set is not loaded yet; kept here for later use:
#g = pd.read_csv("test_with_no_labels.csv")
#g.head()

## let the washing begin!!

In [None]:
#rearrange the columns for clarity: tweet text first, then its label
# .copy() makes data_sorted an independent frame. Without it, pandas treats
# data_sorted as a selection of `data`, and every later
# `data_sorted['message'] = ...` assignment emits SettingWithCopyWarning.
data_sorted = data[["message", "sentiment"]].copy()
data_sorted.head()



In [None]:
#remove web urls
# Each http:// or https:// link is swapped for one fixed placeholder token, so
# the presence of a link is kept while the link text itself is discarded.
subs_url = r'url-web'
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
data_sorted['message'] = data_sorted['message'].replace(
    regex=True,
    to_replace=pattern_url,
    value=subs_url,
)



In [None]:
#make everything lower case
# Lower-casing makes differently capitalised spellings of a word compare equal
# in the later token-level steps.
data_sorted['message'] = data_sorted['message'].str.lower()

# Display the frame to inspect the result of this step.
data_sorted

In [None]:
#remove all punctuations

# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single str.translate pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(post):
    """Return *post* with all ASCII punctuation characters removed.

    The characters removed are exactly those in ``string.punctuation``; this
    includes ``#``, ``@`` and ``-``, so a token such as ``url-web`` becomes
    ``urlweb``. Non-ASCII punctuation is left untouched.

    Args:
        post: The text to clean, as a ``str``.

    Returns:
        A new string containing the remaining characters in their original
        order.
    """
    return post.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet, one message at a time.
data_sorted['message'] = data_sorted['message'].map(remove_punctuation)
data_sorted

In [None]:
#tokenisation: split every message into a list of word tokens
# After this cell each entry of the 'message' column is a list of strings
# rather than a single string.
tokeniser = TreebankWordTokenizer()
data_sorted['message'] = data_sorted['message'].map(tokeniser.tokenize)

data_sorted

In [None]:
#stem to transform to the root word, and have more common words
# Stemming maps inflected forms of a word onto one shared stem, which shrinks
# the vocabulary. This is NLTK's Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    """Return the stem of every token in *words*.

    Args:
        words: Iterable of token strings.
        stemmer: Object exposing a ``stem(word)`` method, for example an NLTK
            ``SnowballStemmer``.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in the
        original order.
    """
    return list(map(stemmer.stem, words))

# Stem the token list of every tweet; the stemmer is forwarded to
# mbti_stemmer as a keyword argument.
data_sorted['message'] = data_sorted['message'].apply(mbti_stemmer, stemmer=stemmer)

data_sorted

In [None]:
#lemmatization to lessen unique words even more by changing tenses of words and plurals
# NOTE(review): the tokens reaching this step have already been stemmed, so
# many of them may no longer be dictionary words that WordNet can look up;
# confirm that this step still changes anything.
lemmatizer = WordNetLemmatizer()

def mbti_lemma(words, lemmatizer):
    """Return the lemma of every token in *words*.

    ``lemmatize`` is called with the token only, so no part-of-speech tag is
    supplied. NOTE(review): NLTK's ``WordNetLemmatizer`` then falls back to
    its default part of speech (noun), so verb tenses are presumably left
    unchanged; confirm against the NLTK documentation.

    Args:
        words: Iterable of token strings.
        lemmatizer: Object exposing a ``lemmatize(word)`` method, for example
            an NLTK ``WordNetLemmatizer``.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for each token, in
        the original order.
    """
    lemmatize = lemmatizer.lemmatize
    return [lemmatize(token) for token in words]

# Lemmatise the token list of every tweet; the lemmatizer is forwarded to
# mbti_lemma as a keyword argument.
data_sorted['message'] = data_sorted['message'].apply(mbti_lemma, lemmatizer=lemmatizer)

data_sorted

In [None]:
#remove standard stop words, which are words of insignificance
# Cache of stop-word sets keyed by language, so the NLTK corpus is read once
# per language instead of once per token.
_STOP_WORD_CACHE = {}


def _stop_word_set(language='english'):
    """Return NLTK's stop words for *language* as a cached frozenset."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional container of words to drop. When omitted, NLTK's
            English stop-word list is used; it is loaded once and reused as a
            set on later calls.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stop_words is None:
        stop_words = _stop_word_set()
    return [t for t in tokens if t not in stop_words]

# Drop stop words from the token list of every tweet.
# NOTE(review): the tokens were stemmed and stripped of punctuation in earlier
# cells, so stop words whose cleaned form differs from the dictionary form may
# no longer match here; confirm the intended order of the steps.
data_sorted['message'] = data_sorted['message'].map(remove_stop_words)

data_sorted

# Exploratory data analysis

In [None]:
#see the min and max sentiment class, and how many tweets fall in each class
# (the most frequent class, i.e. the mode, can be read off the counts)
from collections import Counter  # imported here so the cell runs on its own

print("min:", np.min(data_sorted["sentiment"]))
print("max:", np.max(data_sorted["sentiment"]))

# Counter tallies the labels in one pass. dict() keeps the printout a plain
# dict whose keys appear in order of first occurrence.
dict_ = dict(Counter(data_sorted["sentiment"]))

print("total number of sentiments:", len(data_sorted))
print("counts of the respective sentiments:", dict_)


In [None]:
#the skewness (asymmetry) and kurtosis (tail heaviness) of the sentiment labels
sentiment_labels = data_sorted["sentiment"]
print("skewness: ", sentiment_labels.skew())
print(" kurtosis: ", sentiment_labels.kurtosis())

In [None]:
#the frequency distribution of the sentiments, one bar per sentiment class
#Thibello
ax = sns.countplot(x='sentiment', data=data_sorted, palette="hls")
ax.set_title("Distribution of Sentiments")

# Streamlit

Streamlit will provide the visualisations and the user interface for the Twitter Sentiment Predictor.

In [None]:
# JS
# Example code we may use for Streamlit

# import Streamlit (ensure all libraries and dependencies downloaded for environment)
import streamlit as st
import altair as alt

# slider functionality
# x = st.slider('x')
# st.write(x, 'squared is', x * x)

# reuse data across runs (this would be the final clean data we have)
# read_and_cache_csv = st.cache(pd.read_csv)

# create title on webpage
# st.write('# Streamlit Twitter Sentiment App')