# 🏸Thomas Cup Badminton 2022 - Tweets Analysis

I collected tweets about the Thomas Cup Badminton 2022 into a dataset and performed exploratory data analysis (EDA) on it. <br>
Check out the dataset [here](https://www.kaggle.com/datasets/tejasurya/thomas-cup-2022-badminton-tweets)

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

%matplotlib inline 
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import warnings
warnings.filterwarnings("ignore")

**Reading dataset**

In [None]:
# Load the tweets from the Kaggle input directory and preview the first rows.
DATASET_PATH = '../input/thomas-cup-2022-badminton-tweets/thomascup_2022_tweets.parquet'
tweets_data = pd.read_parquet(DATASET_PATH)
print(tweets_data.head())

**Shape**

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
tweets_data.shape

**Columns**

In [None]:
# Column labels available in the dataset.
tweets_data.columns

In [None]:
# Print a concise summary: column names, non-null counts and dtypes.
tweets_data.info()

**DType conversion**

In [None]:
# Cast the free-text columns to pandas' dedicated string dtype.
# 'mentionedUsers' is converted from its own values; previously it was
# overwritten with a copy of the 'text' column.
string_columns = [
    'text',
    'username',
    'hashtags',
    'language',
    'quotedtweet',
    'inReplyToUser',
    'mentionedUsers',
]
for column in string_columns:
    tweets_data[column] = tweets_data[column].astype('string')

# Parse the creation timestamp into a datetime column.
tweets_data['created_at'] = pd.to_datetime(tweets_data['created_at'])

In [None]:
# Check the dtype of every column after the conversion.
tweets_data.dtypes

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
tweets_data.describe().T

In [None]:
# Remove the 'quotedtweet' column from the DataFrame in place.
tweets_data.drop('quotedtweet', axis=1, inplace=True)

### **Missing values**

In [None]:
def missing_data(data):
    """Summarise the missing values of every column in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to inspect. The summary is computed from this argument,
        not from any module-level DataFrame.

    Returns
    -------
    pandas.DataFrame
        A transposed summary with one column per column of ``data`` and the
        rows 'Total' (number of nulls), 'Percentage' (nulls as a percentage
        of the row count) and 'Types' (dtype name).
    """
    null_mask = data.isnull()
    total_count = null_mask.sum()
    # count() on the boolean mask never sees a null, so it equals the row count.
    percentage = total_count / null_mask.count() * 100
    summary = pd.concat([total_count, percentage], axis=1, keys=['Total', 'Percentage'])
    # Iterate dtypes positionally so the order matches the summary rows.
    summary['Types'] = [str(dtype) for dtype in data.dtypes]
    return np.transpose(summary)

In [None]:
# Display the per-column missing-value summary for the tweets.
missing_data(tweets_data)

# EDA

## Sentiment analysis
### With nltk SentimentIntensityAnalyzer

In [None]:
sia = SentimentIntensityAnalyzer()
def get_sentiment(post):
    """Classify ``post`` by the sign of its compound sentiment score.

    Parameters
    ----------
    post : str
        Text to score with the module-level ``sia`` analyzer.

    Returns
    -------
    str
        "Positive" if the compound score is above zero, "Negative" if it is
        below zero, otherwise "Neutral".
    """
    # Score the text once and reuse the value for both comparisons.
    compound = sia.polarity_scores(post)["compound"]
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

In [None]:
# Label every tweet text as Positive / Negative / Neutral.
tweets_data['text_sentiment'] = tweets_data['text'].apply(get_sentiment)

In [None]:
# Capture the current column labels of tweets_data.
cols = tweets_data.columns

## Data Exploration

**Most frequent value per column**

In [None]:
def frequent_values(data):
    """Report the most frequent value of every column in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to inspect. Columns are taken from ``data`` itself, so the
        function works on any DataFrame.

    Returns
    -------
    pandas.DataFrame
        A transposed summary with one column per column of ``data`` and the
        rows 'total' (non-null count), 'Most frequent item', 'Count' (number
        of occurrences of that item) and 'Percent from total' (Count as a
        percentage of 'total', rounded to 3 decimals). A column with no
        non-null values reports ``None`` / 0 / NaN for the last three rows.
    """
    total = data.count()
    temp_df = pd.DataFrame({'total': total})
    items, values = [], []
    for col in data.columns:
        # value_counts() is sorted by descending frequency; compute it once.
        counts = data[col].value_counts()
        if counts.empty:
            # All values are null: there is no most frequent item.
            items.append(None)
            values.append(0)
        else:
            items.append(counts.index[0])
            values.append(counts.iloc[0])
    temp_df['Most frequent item'] = items
    temp_df['Count'] = values
    percent = pd.Series(values, index=total.index) / total * 100
    temp_df['Percent from total'] = np.round(percent, 3)
    return temp_df.T

In [None]:
# Display the most frequent value of each column of the tweets.
frequent_values(tweets_data)

**Observation**<br>
* "#thomascup" is the most frequent word used in the tweets.
* There are 5126 tweets with positive sentiment.

**Username**

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` with a percentage label on each bar.

    Parameters
    ----------
    feature : str
        Name of the column to count.
    title : str
        Human-readable name used in the plot title.
    df : pandas.DataFrame
        Data containing ``feature``.
    size : int or float, default 1
        Width multiplier; the figure is ``4 * size`` wide and 4 high. When
        it is greater than 2 the x tick labels are rotated and shrunk.
    ordered : bool, default True
        If True, show only the 20 most frequent values, most frequent first.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    order = df[feature].value_counts().index[:20] if ordered else None
    # Pass the column by keyword: recent seaborn releases treat the first
    # positional argument as `data`, not `x`. Drawing on `ax` explicitly
    # keeps the bars on the axes that are annotated below.
    sns.countplot(x=feature, data=df, order=order, palette='Set2', ax=ax)
    ax.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Label each bar with its share of all rows in df.
        ax.text(p.get_x()+p.get_width()/2., height, '{:1.2f}%'.format(100*height/total), ha="center")
    plt.show()

In [None]:
# Count plot of the most active users, drawn four times wider than the default.
plot_count("username", "User name", tweets_data, size=4)

**Observation**<br>
* Mellikka is the user who tweeted the most on this topic, with 125 tweets.
* They are followed by the users BadmintonTalk, porsianaADD, etc.

## Data Visualisation

**Wordcloud**

In [None]:
def show_wordcloud(data, title=""):
    """Render a word cloud built from the non-null strings in ``data``.

    Parameters
    ----------
    data : pandas.Series
        Text values; nulls are dropped and the rest are joined with spaces.
    title : str, default ""
        Title shown above the word cloud.
    """
    text = " ".join(t for t in data.dropna())
    stopwords = set(STOPWORDS)
    # Each stopword needs its own comma: adjacent string literals are
    # concatenated, which previously produced a single "BadmintonIndia" entry.
    stopwords.update(["t", "co", "https", "amp", "U", "Badminton", "India", "Thomascup", "Thomas"])
    wordcloud = WordCloud(stopwords=stopwords, scale=4, max_font_size=50, max_words=500,background_color="black").generate(text)
    fig = plt.figure(1, figsize=(16,16))
    plt.axis('off')
    fig.suptitle(title, fontsize=20)
    fig.subplots_adjust(top=2.3)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

## Text

In [None]:
# Word cloud over the text of all tweets.
show_wordcloud(tweets_data['text'], title = 'Most used words in text')

In [None]:
def plot_sentiment(df, feature, title):
    """Plot sentiment label counts and percentages side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the sentiment label column.
    feature : str
        Name of the column holding the sentiment labels.
    title : str
        Human-readable name used in the axis labels and figure title.
    """
    counts = df[feature].value_counts()
    # Scale to 0-100 so the values match the "Percentage" axis label.
    percent = counts / counts.sum() * 100

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    counts.plot(kind='bar', ax=ax1, color='green')
    percent.plot(kind='bar', ax=ax2, color='blue')
    ax1.set_ylabel(f'Counts : {title} sentiments', size=14)
    ax2.set_ylabel(f'Percentage : {title} sentiments', size=14)
    plt.suptitle(f"Sentiment analysis: {title}")
    plt.tight_layout()
    plt.show()

In [None]:
# Bar charts of the sentiment labels assigned to the tweet text.
plot_sentiment(tweets_data, 'text_sentiment', 'Text')

**Observation**
* 52.82% of the tweets in the Thomas Cup 2022 badminton dataset are positive.

In [None]:
# Word cloud restricted to tweets labelled 'Positive'.
show_wordcloud(tweets_data.loc[tweets_data['text_sentiment']=='Positive', 'text'], title = 'Most used words in texts (Positive sentiment)')

In [None]:
# Word cloud restricted to tweets labelled 'Negative'.
show_wordcloud(tweets_data.loc[tweets_data['text_sentiment']=='Negative', 'text'], title = 'Most used words in texts (Negative sentiment)')

In [None]:
# Word cloud restricted to tweets labelled 'Neutral'.
show_wordcloud(tweets_data.loc[tweets_data['text_sentiment']=='Neutral', 'text'], title = 'Most used words in texts (Neutral sentiment)')

## With TextBlob

In [None]:
def get_sentiment_polarity_textblob(post):
    """Return the sum of the TextBlob polarity of every sentence in ``post``."""
    sentences = TextBlob(post).sentences
    return sum(sentence.sentiment.polarity for sentence in sentences)

def get_sentiment_subjectivity_textblob(post):
    """Return the sum of the TextBlob subjectivity of every sentence in ``post``."""
    sentences = TextBlob(post).sentences
    return sum(sentence.sentiment.subjectivity for sentence in sentences)

In [None]:
# Add the TextBlob polarity and subjectivity of each tweet text as new columns.
tweet_text = tweets_data['text']
tweets_data['text_sentiment_polarity'] = tweet_text.apply(get_sentiment_polarity_textblob)
tweets_data['text_sentiment_subjectivity'] = tweet_text.apply(get_sentiment_subjectivity_textblob)

In [None]:
def plot_sentiment_textblob(df, feature, title):
    """Plot KDE curves of the polarity and subjectivity columns of ``feature``.

    Reads ``df[feature + '_sentiment_polarity']`` and
    ``df[feature + '_sentiment_subjectivity']`` and draws each density on its
    own axis, labelled with ``title``.
    """
    # Look both columns up first so a missing column fails before any figure exists.
    series_by_panel = (
        (df[feature + '_sentiment_polarity'], 'blue', f'Sentiment polarity : {title}'),
        (df[feature + '_sentiment_subjectivity'], 'green', f'Sentiment subjectivity: {title}'),
    )

    figure, axes = plt.subplots(ncols=2, figsize=(12, 5))

    for axis, (series, colour, _) in zip(axes, series_by_panel):
        series.plot(kind='kde', ax=axis, color=colour)
    for axis, (_, _, label) in zip(axes, series_by_panel):
        axis.set_ylabel(label, size=14)

    plt.suptitle(f"Sentiment analysis (polarity & subjectivity): {title}")
    plt.tight_layout()
    plt.show()

In [None]:
# Density plots of the TextBlob polarity and subjectivity of the tweet text.
plot_sentiment_textblob(tweets_data, "text", 'Text')

If you liked the Thomas Cup 2022 tweets dataset, please upvote it. Feedback and suggestions are welcome. Thanks a lot! - `@tejasurya`