## Read in files using read_csv

In [None]:
# Reading in the three exported CSV files using pandas read_csv
csv_paths = [
    'Renaissance23July-29July(02_50_07am_to_05_01_07am).csv',
    'Renaissance29July-01August(05_01_07am_to_22_04_49pm).csv',
    'Renaissance1August-3August(22_05_07pm_to_00_07_12pm).csv',
]
# One dataframe per file, in the same order as the paths above
df1, df2, df3 = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [None]:
# Concatenate the three dataframes into one; ignore_index=True renumbers the rows 0..n-1
tweet_df = pd.concat([df1, df2, df3], ignore_index=True)

###  Data Assessment
Data was assessed here for data quality issues such as missing data, incorrect datatypes and duplicates, as well as data tidiness issues.
While assessing for duplicates, the tweet id is treated as the primary key (unique identifier) of the dataframe.

Column Descriptions

id - Unique id for each tweet
username - The twitter username of the tweeter
time_of_tweet - The time the tweet was tweeted
tweet - The content of the tweet
location - Location of Tweeter
retweets - The number of times the tweet has been retweeted
likes - The number of times the tweet has been liked
followers - The number of followers of the Tweeter
following - The number of followings of the Tweeter
verified - Whether the Tweeter is verified or not? True/False
tweet_source - The Source of Tweet

In [None]:
# Checking the number of rows and columns
tweet_df.shape

In [None]:
# Checking the first five rows of dataframe
tweet_df.head()

In [None]:
# Checking the last five rows of dataframe
tweet_df.tail()


In [None]:
# Checking random 5 rows of data
tweet_df.sample(5)

In [None]:
# Getting a concise summary of data
tweet_df.info()

In [None]:
# Checking for duplicates
tweet_df.duplicated(subset='id').sum()

In [None]:
# Viewing duplicates: every row whose id occurs more than once, ordered by id.
# keep=False flags all members of a duplicated id (not just the repeats), and the
# boolean mask simply yields an empty frame when there are no duplicates, whereas
# pd.concat over an empty generator raises "No objects to concatenate".
tweet_df[tweet_df.duplicated(subset='id', keep=False)].sort_values('id', kind='stable')

In [None]:
# Checking for missing data or null
tweet_df.isnull().sum()

In [None]:
# Checking statistics of dataframe
tweet_df.describe()

In [None]:
# Checking the datatypes of each column
tweet_df.dtypes

## Data Cleaning
The dataframe is cleaned in this section. Erroneous datatypes were changed to the correct datatypes (id was converted from integer to string because it is a unique identifier and not to be used for calculations), missing values in location were filled with 'No Location', duplicate entries were dropped, and the dataframe was restricted to the correct timeframe.

Making a Copy of Data Before Cleaning

In [None]:
tweets_df = tweet_df.copy()

## Issue 1:
Define
Convert time_of_tweet to datetime and tweet id to string.

Code



In [None]:
# Converting to datetime with one vectorised pd.to_datetime call instead of parsing
# row by row through .apply. utc=True gives the column a single timezone-aware (UTC)
# dtype, which is what the timezone-aware start/end bounds used for filtering require.
tweets_df['time_of_tweet'] = pd.to_datetime(tweets_df['time_of_tweet'], utc=True)

In [None]:
# converting id to string
tweets_df['id'] = tweets_df['id'].astype(str)

### Test



In [None]:
# Checking the datatypes
tweets_df.dtypes

## Issue 2:
Define
Drop tweets before 2022-07-24 and after 2022-08-02

Code

Note, Twitter's time is UTC

In [None]:
# Window boundaries: start_date is inclusive, end_date is exclusive
start_date = '2022-07-24 00:00:00+00:00'
end_date = '2022-08-03 00:00:00+00:00'

# Keeping only the tweets whose timestamp falls inside the window
tweet_times = tweets_df['time_of_tweet']
mask = (tweet_times >= start_date) & (tweet_times < end_date)
tweets_df = tweets_df[mask]

In [None]:
### Test

In [None]:
# Checking that rows were dropped and that the remaining tweets fall inside the window.
# A notebook cell only renders its final expression, so the three checks are returned
# together as one tuple: (shape, earliest tweet time, latest tweet time).
tweets_df.shape, tweets_df.time_of_tweet.min(), tweets_df.time_of_tweet.max()

 COMMENT: Tweets now range from the 24th of July to the 2nd of August, a span of 10 days.

## Issue 3:
Define
Drop Unnamed: 0 column from dataframe

Code

In [None]:
# Dropping the 'Unnamed: 0' column.
# columns= already names the axis, so axis=1 is not needed. Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second run is a no-op rather than a KeyError.
tweets_df = tweets_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Reset index
tweets_df = tweets_df.reset_index(drop=True)

In [None]:
# checking to see if column has been dropped successfully
tweets_df.columns

In [None]:
tweets_df.tail()

## Issue 4:
Define
Replace Missing values in Location with 'No Location'

Code

In [None]:
# Filling nulls in the location column using fillna.
# The result is assigned back to the column explicitly: calling
# fillna(inplace=True) on the attribute-accessed Series is chained assignment, which
# pandas warns about and which does not update the dataframe under Copy-on-Write.
tweets_df['location'] = tweets_df['location'].fillna('No Location')

#### Test

In [None]:
# Checking for missing values/ nulls
tweets_df.isnull().sum()

## Issue 5:
Define
Drop Duplicates

Code


In [None]:
# Keep the first row seen for each tweet id and discard the later repeats
tweets_df = tweets_df.drop_duplicates(subset='id')

#### Test

In [None]:
# Checking for duplicates
tweets_df.duplicated(subset='id').sum()

####  COMMENT: I'm done with all cleaning so I'll save it to a csv file.

#### STORING CLEANED DATAFRAME TO A CSV FILE

In [None]:
# index=False: the row index carries no information, and writing it would add an
# 'Unnamed: 0' column when the file is read back (the column dropped in Issue 3).
tweets_df.to_csv('Renaissance_Cleaned.csv', index=False)

## Data Preprocessing
Data preprocessing involves all data cleaning in preparing tweets for Sentiment analysis. To do this, I created several functions which I applied to 'tweet' column in my dataframe to produce desired results. Properly preprocessing data results in more accurate downstream processes.


Also, for my Word Cloud, I wanted to show the words used to describe the album, so I created a function to extract some Positive Music words to a new column.

In [None]:
# Defining a function to extract hashtags with REGEX(Regular Expressions)
def getHashtags(tweet):
    """Return all hashtags found in `tweet`, lower-cased and joined by single spaces.

    A hashtag is a '#' followed by one or more word characters. An empty string is
    returned when the tweet contains none.
    """
    found_tags = re.findall(r'\#\w+', tweet.lower())
    return " ".join(found_tags)

In [None]:
# Getting Hashtags and storing in column 'Hashtags'
tweets_df['hashtags'] = tweets_df['tweet'].apply(getHashtags)
tweets_df.head()

In [None]:
# Importing Collection module to determine unique count of all hashtags used
from collections import Counter

# Store hashtags in a list (one space-separated string of hashtags per tweet)
hashtags_list = tweets_df['hashtags'].tolist()

# Flatten to one entry per hashtag; rows without hashtags contribute nothing
hashtags = [tag for row_tags in hashtags_list for tag in row_tags.split()]

# Determine Unique count of all hashtags used.
# Building the frame from (hashtag, count) pairs with explicit column names also works
# when no hashtag was found at all; the from_dict/reset_index route yields a single
# column in that case and the two-name column assignment then fails.
counts = Counter(hashtags)
hashtags_df = pd.DataFrame(list(counts.items()), columns=['hashtags', 'count'])
hashtags_df = hashtags_df.sort_values(by='count', ascending=False)
print("The Total Number of Unique Hashtags is: ", hashtags_df.shape[0])

In [None]:
# Checking the hashtag dataframe for the top 10 hashtags used
hashtags_df.head(10)

In [None]:
# Saving hashtags dataframe to a csv file
hashtags_df.to_csv('Ren_Hashtags.csv')

In [None]:
# Defining a function that lower-cases a tweet
def getTweetsLower(tweet):
    """Return `tweet` converted to lower case."""
    return tweet.lower()

In [None]:
# Get Tweets in lower case and store as tweet_lowercase
tweets_df['tweet_lowercase'] = tweets_df['tweet'].apply(getTweetsLower)
tweets_df.head()

In [None]:
# Creating a List containing Renaissance Track
renaissance_tracks = ["i'mthatgirl", "cozy", "aliensuperstar", "cuffit", "energy", "breakmysoul", "churchgirl", 
                     "plasticoffthesofa", "virgo'sgroove", "move", "heated", "thique", "allupinyourmind",
                      "americahasaproblem", "pure/honey", "summerrenaissance"]


In [None]:
# Define function to replace track names as one word track name in a new column tweet_track
def trackNames(ren_track):
    """Collapse multi-word track titles inside `ren_track` into single tokens.

    Each (spaced title, joined title) pair is applied in order with re.sub, so every
    occurrence of the spaced spelling is replaced. The patterns carry no word-boundary
    anchors, so they also match inside longer words. The text is expected to be
    lower-cased already, because the patterns are lower case.
    """
    single_word_titles = (
        ("plastic off the sofa", "plasticoffthesofa"),
        ("i'm that girl", "i'mthatgirl"),
        ("im that girl", "i'mthatgirl"),
        ("alien superstar", "aliensuperstar"),
        ("cuff it", "cuffit"),
        ("break my soul", "breakmysoul"),
        ("church girl", "churchgirl"),
        ("virgo's groove", "virgo'sgroove"),
        ("virgo groove", "virgo'sgroove"),
        ("virgos groove", "virgo'sgroove"),
        ("all up in your mind", "allupinyourmind"),
        ("america has a problem", "americahasaproblem"),
        ("summer renaissance", "summerrenaissance"),
    )

    collapsed = ren_track
    for spaced_title, joined_title in single_word_titles:
        collapsed = re.sub(spaced_title, joined_title, collapsed)
    return collapsed
tweets_df['tweet_track'] = tweets_df['tweet_lowercase'].apply(trackNames)
tweets_df.head()                      

In [None]:
# Define function to extract Renaissance Track from each tweet_track
def getRenaissanceTrack(tweet_track):
    """Return the track names mentioned in `tweet_track`, joined by single spaces.

    The text is lower-cased and split with word_tokenize; only tokens that are exactly
    equal to an entry of renaissance_tracks are kept, in their original order and with
    repeats preserved. An empty string is returned when no track is mentioned.
    """
    tokens = word_tokenize(tweet_track.lower())
    return " ".join(token for token in tokens if token in renaissance_tracks)


In [None]:
# Extract tracks to a new column
tweets_df['track'] = tweets_df['tweet_track'].apply(getRenaissanceTrack)
tweets_df.head()

In [None]:
# Store track in a list (one space-separated string of track names per tweet)
track_list = tweets_df['track'].tolist()

# Flatten to one entry per track mention; rows without a track contribute nothing
track = [name for row_tracks in track_list for name in row_tracks.split()]

# Determine Unique count of all track.
# Building the frame from (track, count) pairs with explicit column names also works
# when no track was mentioned at all; the from_dict/reset_index route yields a single
# column in that case and the two-name column assignment then fails.
counts = Counter(track)
track_df = pd.DataFrame(list(counts.items()), columns=['Ren_track', 'Count'])
track_df = track_df.sort_values(by='Count', ascending=False)
print("The Total Number of Unique Tracks is: ", track_df.shape[0])
track_df

In [None]:
# Saving Track df to csv file
track_df.to_csv('Ren_Track.csv')

In [None]:
# Creating a list of Positive words about the album
renaissance_positive_words = ["noskips", "noshuffles", "vocals", "lyrics", "beats", "production", 
                     "samples", "harmonies"]

In [None]:
# Define function to replace Positive words as one word Positive words in a new column tweet_positive_words
def positiveWords(ren_positive_words):
    """Collapse multi-word praise phrases inside `ren_positive_words` into single tokens.

    Each (phrase, token) pair is applied in order with re.sub. The plural
    "no shuffles" must be handled before the singular "no shuffle": with the singular
    first, "no shuffles" turns into "noshuffless" (the trailing "s" is left behind)
    and no longer equals the "noshuffles" keyword. The text is expected to be
    lower-cased already, because the patterns are lower case.
    """
    replacements = [("no skips","noskips"), ("zero skips", "noskips"), ("0 skips", "noskips"),
                    ("no shuffles", "noshuffles"), ("no shuffle", "noshuffles")]
    for pat,repl in replacements:
        ren_positive_words = re.sub(pat, repl, ren_positive_words)
    return ren_positive_words
tweets_df['tweet_positive_words'] = tweets_df['tweet_lowercase'].apply(positiveWords)
tweets_df.head()

In [None]:
# Define function to extract Positive words from each Tweet
def getPositiveWord(tweet_positive_words):
    """Return the positive keywords found in `tweet_positive_words`, joined by spaces.

    The text is lower-cased and split with word_tokenize; only tokens that are exactly
    equal to an entry of renaissance_positive_words are kept, in their original order
    and with repeats preserved. An empty string is returned when none is present.
    """
    tokens = word_tokenize(tweet_positive_words.lower())
    return " ".join(token for token in tokens if token in renaissance_positive_words)

In [None]:
# Extract Positive words to a new column
tweets_df['positive_words'] = tweets_df['tweet_positive_words'].apply(getPositiveWord)
tweets_df.head()

In [None]:
# Store positive words in a list (one space-separated string of keywords per tweet)
pos_list = tweets_df['positive_words'].tolist()

# Flatten to one entry per keyword; rows without a positive word contribute nothing
pos = [word for row_words in pos_list for word in row_words.split()]

# Determine Unique count of all positive words.
# Building the frame from (word, count) pairs with explicit column names also works
# when no positive word was found at all; the from_dict/reset_index route yields a
# single column in that case and the two-name column assignment then fails.
counts = Counter(pos)
positive_words_df = pd.DataFrame(list(counts.items()), columns=['Positive_Words', 'Count'])
positive_words_df = positive_words_df.sort_values(by='Count', ascending=False)
positive_words_df

In [None]:
# saving positive words dataframe to a csv file
positive_words_df.to_csv('Ren_Positive_Words.csv')

In [None]:
import string

# Defining my NLTK stop words and my user-defined stop words
stop_words = list(stopwords.words('english'))
# Extra stop words added on top of the NLTK English list. The literal repeats 'next'
# and 'year'; the repeats are harmless because the list is only used for membership tests.
user_stop_words = ["i", "i'm", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
                   "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
                   "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
                   "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", 
                   "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "i'll","we'll","they'll","you'll","she'll","he'll","'ll","n't",
                   "'s",'anyone','today','yesterday','day', 'already', 'year', 'many', 'much', 'amp', 'next', 'cant', 'wont', 'hadnt','havent', 'hasnt', 
                   'isnt', 'shouldnt', "didn't", "couldn't", 'wasnt', 'werent','mustnt', 
                   'been…','aht', 've', 'next',"all", "any", "both", "each", 'by',
                  'year',]

# The words below are common to most tweets about the album, so they are not relevant to the analysis.
common_words = ['renaissance', 'beyonce', 'bey', 'rennaissance', 'album', "beyonce's", "beehive", "transitions"]
# The 26 single lower-case ASCII letters
alphabets = list(string.ascii_lowercase)
# stop_words is rebound to the concatenation of all five lists, so track names are filtered out as well
stop_words = stop_words + user_stop_words + alphabets + common_words + renaissance_tracks

In [None]:
emojis = list(UNICODE_EMOJI.keys())
# NOTE(review): assumes the keys of UNICODE_EMOJI are the emoji characters themselves -
# confirm against the installed version of the emoji package.

# Lookup sets and the lemmatizer are built once here rather than per tweet or per token:
# testing membership in a set avoids scanning the full stop-word and emoji lists for
# every single token.
_stop_word_lookup = set(stop_words)
_emoji_lookup = set(emojis)
_lemmatizer = WordNetLemmatizer()

# preProcess tweet for sentiment analysis
def preprocessTweets(tweet):
    """Return a cleaned, lemmatized version of `tweet` for sentiment analysis.

    Steps, in order: lower-case the text; strip URLs; strip @mentions, #hashtags and
    runs of digits; tokenize with word_tokenize; drop tokens that are stop words,
    emojis or punctuation; lemmatize what is left and join it with single spaces.
    """
    tweet = tweet.lower()
    # Cleaning and removing URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags = re.MULTILINE)
    # Cleaning and removing @mentions, #hashtags and runs of digits
    tweet = re.sub(r'\@\w+|\#\w+|\d+', '',  tweet)
    tweet_tokens = word_tokenize(tweet)
    # Dropping stop words, emojis and punctuation tokens in a single pass.
    # string.punctuation is a str, so the last test is a substring check.
    kept_words = [
        w for w in tweet_tokens
        if w not in _stop_word_lookup
        and w not in _emoji_lookup
        and w not in string.punctuation
    ]
    return " ".join(_lemmatizer.lemmatize(w) for w in kept_words)

In [None]:
# Generate a new column called 'Processed Tweets' by applying preprocessed tweets function to the 'Tweet' column.
tweets_df['Processed_Tweets'] = tweets_df['tweet'].apply(preprocessTweets)
tweets_df.head()

In [None]:
# Join every processed tweet into one long string, with a single space between tweets
tweets_long_string = " ".join(tweets_df['Processed_Tweets'].tolist())

## Sentiment Analysis
In this section, I want to show the sentiments expressed about the Renaissance album. I'm going to use TextBlob.

In [None]:
import textblob
from textblob import TextBlob

In [None]:
# Define function to obtain Polarity Score
def getPolarity(tweet):
    """Return the TextBlob sentiment polarity of `tweet`."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

# Define function to obtain Sentiment category
def getSentimentTextBlob(polarity):
    """Map a polarity score to a sentiment label.

    Scores below zero are "Negative"; everything else, including a score of exactly
    zero, is "Positive".
    """
    return "Negative" if polarity < 0 else "Positive"

In [None]:
# Apply the functions to respective columns:
# polarity score from the processed text, then the sentiment label from that score
tweets_df['Polarity'] = tweets_df['Processed_Tweets'].apply(getPolarity)
tweets_df['Sentiment'] = tweets_df['Polarity'].apply(getSentimentTextBlob)

In [None]:
tweets_df.head()

In [None]:
# get unique values and count in sentiment column
tweets_df['Sentiment'].value_counts()

## Data Visualization


In [None]:
# for opening, manipulating, and saving image file
from PIL import Image

In [None]:
data = tweets_df['Sentiment'].value_counts()
data

In [None]:
# Plotting a pie chart to show distribution of Sentiments.
# Labels are taken from the index of the counts, and colours are looked up per label:
# value_counts() orders categories by frequency, so a fixed ['Positive', 'Negative']
# label list would mislabel the slices whenever negative tweets are the majority.
sentiment_colors = {'Positive': '#FEE715', 'Negative': '#666666'}

plt.figure(figsize=[15,10], facecolor='none')
plt.pie(data,
        labels=data.index,
        colors=[sentiment_colors[label] for label in data.index],
        startangle=90,
        explode=[0.05] * len(data),  # one offset per slice, however many categories exist
        autopct='%1.1f%%');
plt.title('Twitter Users Sentiments');

# Save the chart to a PNG file. This has to happen in the cell that draws the figure:
# the inline backend closes the figure when the cell finishes, so a savefig call in a
# later cell would write an empty image.
plt.savefig("Sentiments.png", format="png")

#### Next, Plotting Tracks to show rank

In [None]:
# Sort ascending so the most mentioned track ends up as the top bar.
# ascending must be a real boolean; the string 'True' is rejected by newer pandas.
track_df_sort = track_df.sort_values('Count', ascending=True)

# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases; use whichever name this installation provides.
if "seaborn-v0_8-whitegrid" in plt.style.available:
    plt.style.use("seaborn-v0_8-whitegrid")
else:
    plt.style.use("seaborn-whitegrid")

# Grey for every bar except the last (most mentioned) one, which is highlighted in
# yellow. Sized from the data so the highlight stays on the top bar however many
# tracks were found.
bar_colors = ['#666666'] * max(len(track_df_sort) - 1, 0) + ['#FEE715']

ax = track_df_sort.plot.barh(x='Ren_track', y='Count', figsize=(10,15), legend=None, width=0.6, color=bar_colors)
plt.title('Ranking Of Renaissance Tracks');
plt.grid(False)
plt.ylabel('Tracks')
plt.xlabel('Count')
ax.set_facecolor('white')

plt.rcParams['axes.facecolor'] = 'white'

# Leave 20% headroom on the x axis so the count labels fit next to the longest bar
ax.set_xlim(0, track_df_sort.Count.max()*1.2)

# Write each count at the end of its bar
for y, x in enumerate(track_df_sort.Count):
    ax.annotate("{:,}".format(x), xy=(x, y))

# Save the chart to a PNG file before plt.show(): once the figure has been shown
# (or the cell has finished), a later savefig call would write an empty image.
plt.savefig("Tracks.png", format="png");
plt.show()

#### Next, Generating WordClouds

In [None]:
# Create function to generate the yellow colour for the Word Cloud
def yellow_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
    """Return an HSL colour string: fixed yellow hue, lightness drawn from 50-80%.

    The signature follows the colour-function callback used by the word cloud;
    `word`, `font_size`, `position`, `orientation` and any extra keyword arguments
    are accepted but not used.

    random_state: optional random number generator exposing randint(a, b). When it is
        given, the lightness is drawn from it, so a seeded generator gives reproducible
        colours. When it is None, the global `random` module is used.
    """
    if random_state is None:
        # Imported here so the function does not depend on an import made in a later cell
        import random
        random_state = random
    return "hsl(54, 99%%, %d%%)" % random_state.randint(50, 80)


In [None]:
import random
# Load the horse logo image from disk and convert it to a NumPy array
image = np.array(Image.open('RenaissanceHorse.png'))


In [None]:
 # Instantiate the figure object
plt.figure(figsize=[15,10], facecolor='none')


plt.imshow(image, cmap=plt.cm.gray, interpolation='bilinear') # Display data as an image
plt.axis('off') # Remove axis
plt.show() # Display image

In [None]:
# Instantiate the Twitter word cloud object, using the logo array as the mask
twitter_wc = WordCloud(background_color='#212121', mask=image)

# generate the word cloud
twitter_wc.generate(tweets_long_string)

# display the word cloud
plt.figure(figsize=[15,10], facecolor='none')

plt.imshow(twitter_wc.recolor(color_func = yellow_color_func, random_state=3),
           interpolation="bilinear")
plt.axis('off')
plt.title('Word Cloud Representation of Tweets');

# Save the word cloud to a PNG file before plt.show(): once the figure has been shown
# (or the cell has finished), a later savefig call would write an empty image.
plt.savefig("RenaissanceWordCloud.png", format="png")
plt.show()

### Save the Dataframe to be exported to Microsoft Power BI to Create a Dashboard


In [None]:
tweets_df.to_csv("Renaissance_Final_File.csv", index=False)