# Rumour Analysis with Covid Data

## Inference

In [None]:
import load_data

In [None]:
# Load the COVID tweets without labels (no label file is supplied).
# NOTE(review): presumably returns a pandas DataFrame -- confirm in load_data.
covid_df = load_data.load_data(
    data_file='../data/covid.data.jsonl',
    label_file=None,
)

## Exploratory Data Analysis

The analysis below follows a similar approach to https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert

In [9]:
def load_data_for_analysis(data_file, label_file):
    """Build a DataFrame with one row per event from a JSON-lines file.

    Every non-blank line of ``data_file`` is parsed as a JSON array and only
    its first element is kept (presumably the source tweet of the event --
    confirm against the data format). Blank lines are skipped.

    Args:
        data_file: Path to the JSON-lines file, one JSON array per line.
        label_file: Path to a JSON object mapping ``str(tweet id)`` to a raw
            label, or ``None`` to load the data without labels.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` and ``text``, plus
        ``label`` (the raw label passed through ``convert_label``) when
        ``label_file`` is given.

    Raises:
        KeyError: If an element lacks ``id``/``text`` or a tweet id has no
            entry in the label file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    has_labels = label_file is not None

    y_true = None
    if has_labels:
        # Context manager so the label file handle is always closed.
        with open(label_file, 'r', encoding='utf-8') as label_handle:
            y_true = json.load(label_handle)

    data_list = []
    with open(data_file, 'r', encoding='utf-8') as data_handle:
        # Stream line by line instead of materialising the whole file.
        for raw_line in data_handle:
            if not raw_line.strip():
                # A trailing/blank line is not an event; json.loads would fail.
                continue

            first_tweet = json.loads(raw_line)[0]
            tweet = {'id': first_tweet['id'], 'text': first_tweet['text']}

            if has_labels:
                tweet['label'] = convert_label(y_true[str(tweet['id'])])

            data_list.append(tweet)

    return pd.DataFrame(data_list)



In [None]:
# Load the COVID tweets without labels (no label file is supplied).
# NOTE(review): presumably returns a pandas DataFrame -- confirm in load_data.
covid_df = load_data.load_data(
    data_file='../data/covid.data.jsonl',
    label_file=None,
)

In [None]:
# Bare expression: shows the loaded data as this cell's output for a quick look.
covid_df

In [None]:
# Per-tweet meta features derived from the raw 'text' column. Every value is
# coerced with str() first, so non-string cells are measured by their string
# form rather than raising.

# Number of whitespace-separated tokens.
covid_df['word_count'] = covid_df['text'].apply(lambda text: len(str(text).split()))

# Number of distinct (case-sensitive) tokens.
covid_df['unique_word_count'] = covid_df['text'].apply(lambda text: len(set(str(text).split())))

# Number of lower-cased tokens that appear in STOPWORDS.
covid_df['stop_word_count'] = covid_df['text'].apply(
    lambda text: sum(token in STOPWORDS for token in str(text).lower().split())
)

# Number of tokens containing 'http' (which also covers 'https').
covid_df['url_count'] = covid_df['text'].apply(
    lambda text: sum('http' in token for token in str(text).lower().split())
)

# Average token length in characters.
covid_df['mean_word_length'] = covid_df['text'].apply(
    lambda text: np.mean(list(map(len, str(text).split())))
)

# Total number of characters.
covid_df['char_count'] = covid_df['text'].apply(lambda text: len(str(text)))

# Number of characters that belong to string.punctuation.
covid_df['punctuation_count'] = covid_df['text'].apply(
    lambda text: sum(char in string.punctuation for char in str(text))
)

# Number of '#' characters.
covid_df['hashtag_count'] = covid_df['text'].apply(lambda text: str(text).count('#'))

# Number of '@' characters.
covid_df['mention_count'] = covid_df['text'].apply(lambda text: str(text).count('@'))


In [None]:
# Draw one row of plots per meta feature: the left axis overlays the
# training-set distribution for target == 1 and for the remaining rows, the
# right axis overlays the training and test distributions.
# NOTE(review): df_train, df_test and the 'target' column are not defined in
# the visible cells of this notebook (only covid_df is built) -- confirm they
# exist before running this cell.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
METAFEATURES = [
    'word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length',
    'char_count', 'punctuation_count', 'hashtag_count', 'mention_count',
]
DISASTER_TWEETS = df_train['target'] == 1

fig, axes = plt.subplots(ncols=2, nrows=len(METAFEATURES), figsize=(20, 50), dpi=100)

for i, feature in enumerate(METAFEATURES):
    target_ax = axes[i][0]
    split_ax = axes[i][1]

    # Left: training rows split by target.
    sns.distplot(df_train.loc[~DISASTER_TWEETS][feature], label='Not Disaster', ax=target_ax, color='green')
    sns.distplot(df_train.loc[DISASTER_TWEETS][feature], label='Disaster', ax=target_ax, color='red')

    # Right: training set against test set.
    sns.distplot(df_train[feature], label='Training', ax=split_ax)
    sns.distplot(df_test[feature], label='Test', ax=split_ax)

    # Same cosmetic settings on both axes of the row.
    for ax in (target_ax, split_ax):
        ax.set_xlabel('')
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)
        ax.legend()

    target_ax.set_title(f'{feature} Target Distribution in Training Set', fontsize=13)
    split_ax.set_title(f'{feature} Training & Test Set Distribution', fontsize=13)

plt.show()