# Data Exploration and Visualisation

### Import all required packages

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

### Load the dataset into a DataFrame

In [None]:
# The dataset folder sits next to (not inside) the current working directory:
# go one level up from the CWD and then into "dataset". os.path.join is used
# instead of hard-coded "/" so the path is assembled with the platform's own
# separator.
DATADIR = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
    "dataset",
)

# Read the CSV of tweets into the DataFrame used by the cells below.
main_df = pd.read_csv(os.path.join(DATADIR, "trump_insults_tweets.csv"))

### Exploring the Dataset

In [None]:
# Preview the first rows of the DataFrame (head() returns five rows by default).
main_df.head()

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts, dtypes
# and memory usage.
main_df.info()

In [None]:
# Descriptive statistics per column. With default arguments pandas summarises
# only the numeric columns when any exist; otherwise it falls back to
# count/unique/top/freq for the object columns.
main_df.describe()

In [None]:
# Report the dataset dimensions; shape is a (rows, columns) tuple.
row_count, column_count = main_df.shape
print(f"{row_count} rows and {column_count} columns")

In [None]:
# Number of missing values in each column.
main_df.isna().sum()

In [None]:
# Data type of each column.
main_df.dtypes

In [None]:
# Show rows that exactly repeat an earlier row; with the default keep='first'
# the first occurrence of each repeated row is not flagged.
duplicate_mask = main_df.duplicated()
main_df.loc[duplicate_mask]

### Visualisations

Visualise the distribution of tweets over time

In [None]:
# Parse the 'date' column into datetimes so it can be binned on a time axis.
main_df['date'] = pd.to_datetime(main_df['date'])

# Histogram of tweet dates (30 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(main_df['date'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Tweets Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Tweets')
plt.show()

Visualise the top 10 targets of insults.

In [None]:
# Count the occurrences of each value in the 'target' column and keep the
# ten most frequent (value_counts sorts in descending order).
target_counts = main_df['target'].value_counts().head(10)

# Bar chart of the top 10 targets; labels are rotated so long names stay legible.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=target_counts.index, y=target_counts.values, ax=ax)
ax.set_title('Top 10 Targets of Insults')
ax.set_xlabel('Target')
ax.set_ylabel('Number of Tweets')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Visualise the top 10 types of insults.

In [None]:
# Count the occurrences of each value in the 'insult' column and keep the
# ten most frequent (value_counts sorts in descending order).
insult_counts = main_df['insult'].value_counts().head(10)

# Bar chart of the top 10 insults; labels are rotated so long phrases stay legible.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=insult_counts.index, y=insult_counts.values, ax=ax)
ax.set_title('Top 10 Types of Insults')
ax.set_xlabel('Insult Type')
ax.set_ylabel('Number of Tweets')
ax.tick_params(axis='x', labelrotation=90)
plt.show()