## Import libraries

In [18]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec

## Read tweets

In [19]:
# Load the exported tweets; the path is relative to the notebook's directory.
df = pd.read_csv(r'../assets/data/tweets.csv')

# Only the raw tweet text is needed by the cells below.
tweets = df.loc[:, 'text']

tweets

0              RIP #TomVerlaine https://t.co/w40iGeqmKK
1     Reposted from @vintage.cheese #shesjustdrawnth...
2     Reposted from @thehorrorgallery 😂🤣 🔥 \n\nFollo...
3     Reposted from @jess_rene_ #art #witch #witchcr...
4     Reposted from @bjanetrose 🥀 @kostle_laugh_with...
                            ...                        
95    Reposted from @thelegendsofmusic The White Str...
96    Reposted from @paulrosolie  In the last two we...
97    Reposted from @bjanetrose 🥀 #wonderwoman 🥀  ii...
98    Reposted from dropkickblues Chuck Berry, what ...
99    Hell of a crowd in Wellington, NZ tonight, you...
Name: text, Length: 100, dtype: object

## Pre-process tweets

In [20]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

tweetList = []        # one list of processed tokens per tweet
tweetSentences = []   # the same tokens re-joined into one string per tweet

# A set gives O(1) membership checks while filtering tokens.
stopWords = set(stopwords.words('english'))


def preprocess_tweet(tweet):
    """Turn one raw tweet into a list of normalised tokens.

    Steps, in order: replace every character outside A-Z, a-z, 0-9 with a
    space, lowercase, tokenize, drop English stop words, stem, lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A non-string value (e.g. NaN from an empty CSV cell)
        yields an empty list instead of raising, so the output stays aligned
        with the input rows.

    Returns
    -------
    list of str
        The processed tokens, possibly empty.
    """
    if not isinstance(tweet, str):
        return []

    # Strip everything that is not an ASCII letter or digit, then lowercase in
    # a single pass. No IGNORECASE flag: the class already lists both cases,
    # and with the flag a Unicode pattern's [a-z] also matches a few non-ASCII
    # letters, which would then survive the clean-up.
    cleaned = re.sub(r"[^A-Za-z0-9]", " ", tweet).lower()

    # Tokenize and drop stop words (the text is already lowercase, matching
    # the lowercase stop-word list).
    words = [word for word in nltk.word_tokenize(cleaned) if word not in stopWords]

    # Reduce each remaining word to its stem.
    words = [stemmer.stem(word) for word in words]

    # NOTE(review): lemmatizing already-stemmed tokens is kept to preserve the
    # existing output; consider choosing either stemming or lemmatization.
    return [lemmatizer.lemmatize(word) for word in words]


for tweet in tweets:
    wordsInTweet = preprocess_tweet(tweet)

    # Keep the token list for this tweet
    tweetList.append(wordsInTweet)

    # Rebuild a sentence from the processed tokens
    tweetSentences.append(' '.join(wordsInTweet))

In [21]:
# Show the processed tokens: one inner list per tweet.
tweetList

[['rip', 'tomverlain', 'http', 'co', 'w40igeqmkk'],
 ['repost',
  'vintag',
  'chees',
  'shesjustdrawnthatway',
  'terrygilliam',
  'montypython',
  'legend',
  '1960',
  '1970',
  'vintagechees',
  'iiii',
  'http',
  'co',
  '2hpfzj1jvm'],
 ['repost',
  'thehorrorgalleri',
  'follow',
  'u',
  'thehorrorgalleri',
  'peligrosovalley',
  'iiii',
  'http',
  'co',
  'xkpm5ruwp3'],
 ['repost',
  'jess',
  'rene',
  'art',
  'witch',
  'witchcraft',
  'instaart',
  'macabr',
  'macabreart',
  'artist',
  'darkart',
  'dark',
  'instagood',
  'salem',
  'joseph',
  'tomanek',
  'philip',
  'hofmann',
  'iiii',
  'http',
  'co',
  'wvuuumxzll'],
 ['repost',
  'bjanetros',
  'kostl',
  'laugh',
  'dead',
  'iiii',
  'http',
  'co',
  'cgv2hmyl1i'],
 ['post', 'photo', 'http', 'co', 'vjjloc4scg'],
 ['post', 'photo', 'http', 'co', '2y6cfktrvw'],
 ['post', 'photo', 'http', 'co', 'a8nkjpfqho'],
 ['repost',
  'bizarredoctor',
  'season',
  'witch',
  'sound',
  'artist',
  'minelauvart',
  'iiii'

In [22]:
# Show the processed tokens of each tweet re-joined into a single string.
tweetSentences

['rip tomverlain http co w40igeqmkk',
 'repost vintag chees shesjustdrawnthatway terrygilliam montypython legend 1960 1970 vintagechees iiii http co 2hpfzj1jvm',
 'repost thehorrorgalleri follow u thehorrorgalleri peligrosovalley iiii http co xkpm5ruwp3',
 'repost jess rene art witch witchcraft instaart macabr macabreart artist darkart dark instagood salem joseph tomanek philip hofmann iiii http co wvuuumxzll',
 'repost bjanetros kostl laugh dead iiii http co cgv2hmyl1i',
 'post photo http co vjjloc4scg',
 'post photo http co 2y6cfktrvw',
 'post photo http co a8nkjpfqho',
 'repost bizarredoctor season witch sound artist minelauvart iiii http co jspww6de3k',
 'happi birthday steveadl iiii http co q2hjtdrmsa',
 'repost patfromstoke1 repost fakedream ai iiii http co spfo4fhakc',
 'repost bjanetros artist trinitycatdelin iiii http co 9lgrwvg1nj',
 'repost jayprehistoricpet one absolut giant komodo dragon seem like live dinosaur run laura iiii http co qc9jsxubif',
 'repost thelegendsofmus c