# Data Exploration and Visualisation. 

This notebook focuses on exploring and visualising the `trump_insults_tweets.csv` dataset. 
### Import all required packages

In [None]:
# Importing necessary libraries for data manipulation and visualisation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from wordcloud import  WordCloud, STOPWORDS

### Import dataset into dataframe.

In [None]:
# define the directory where the dataset is located: the "dataset" folder
# inside the parent of the current working directory. os.path.join is used
# throughout so the separator is never hard-coded.
DATADIR = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
    "dataset",
)

# load the dataset into a pandas dataframe
main_df = pd.read_csv(os.path.join(DATADIR, "trump_insults_tweets.csv"))

### Exploring the Dataset

In [None]:
# preview the first 5 rows of the dataset
main_df.head(n=5)

In [None]:
# print a concise summary of the dataframe using DataFrame.info()
main_df.info()

In [None]:
# generate descriptive statistics for the dataframe using describe() with its
# default arguments
# NOTE(review): which columns are summarised depends on their dtypes — confirm
# the output covers the columns of interest
main_df.describe()

In [None]:
# print the dataset dimensions: shape[0] is the row count, shape[1] the column count
print(f"{main_df.shape[0]} rows and {main_df.shape[1]} columns") 

In [None]:
# count the missing values in each column (sum the null flags down the rows)
main_df.isnull().sum(axis=0)

In [None]:
# display the data type (dtype) of each column
main_df.dtypes

In [None]:
# flag duplicate rows with duplicated(), then display only the flagged rows
duplicate_mask = main_df.duplicated()
main_df[duplicate_mask]

### Visualisations

In this section, various visualisations are used to understand the data better. <br>
**Visualise the distribution of tweets over time**

In [None]:
# parse the date column into datetime values so it can be binned on a time axis
main_df['date'] = pd.to_datetime(main_df['date'])

# histogram (30 bins, with a KDE overlay) of tweet dates on an explicit axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(main_df['date'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Tweets Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Tweets')
plt.show()

**Visualise the top 10 targets of insults.**

In [None]:
# count the occurrences of each target and keep the 10 most frequent
target_counts = main_df['target'].value_counts().head(10)

# bar chart of the top 10 targets, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=target_counts.index, y=target_counts.values, ax=ax)
ax.set_title('Top 10 Targets of Insults')
ax.set_xlabel('Target')
ax.set_ylabel('Number of Tweets')
# rotate the category labels so long target names do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

**Visualise the top 10 types of insults.**

In [None]:
# count the occurrences of each insult and keep the 10 most frequent
insult_counts = main_df['insult'].value_counts().head(10)

# bar chart of the top 10 insults, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=insult_counts.index, y=insult_counts.values, ax=ax)
ax.set_title('Top 10 Types of Insults')
ax.set_xlabel('Insult Type')
ax.set_ylabel('Number of Tweets')
# rotate the category labels so long insult strings do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

**Visualise the first tweet**

In [None]:
# Start with the first tweet. .iloc[0] selects by position, so this still
# works if the index does not contain the label 0 (e.g. after filtering rows).
text = main_df['tweet'].iloc[0]

# Create and generate a word cloud image from that single tweet
wordcloud = WordCloud().generate(text)

# display the generated image without axis ticks or frame
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Visualise all tweets as a word cloud (stopwords removed)

In [None]:
# Combine all tweets into one string. Missing tweets (NaN) are dropped and the
# remaining values cast to str, because str.join raises TypeError on any
# non-string item.
all_tweets = " ".join(main_df['tweet'].dropna().astype(str))

# Create the stopword set (words to exclude from the cloud)
stopwords = set(STOPWORDS)

# Generate a word cloud image from the combined text, excluding the stopwords
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(all_tweets)

# display the generated image without axis ticks or frame
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()