# 1. Preliminary

## 1.1 Context

* We will analyze a very well-known NLP dataset: disaster tweets


* It comes from a Kaggle competition and offers a simple but good-quality text dataset for getting started with NLP


* The dataset is available [here](https://www.kaggle.com/competitions/nlp-getting-started/data)


* Please use the **train** dataset


* In this first part we simply explore the dataset and get familiar with it

## 1.2 Requirements

You have to install:

* pandas
* numpy
* matplotlib
* seaborn

## 1.3 Imports

In [None]:
import os, sys, time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## 1.4 Graphics and options

In [None]:
sns.set()

## 1.5 Loading data

In [None]:
# !tree

![repo](./img/repo.jpg)

In [None]:
# our file

data = "./data/source/"
os.listdir(data)

In [None]:
# Build the CSV path from the source folder and read it into a DataFrame.

fn = f"{data}twit_from_disaster_train.csv"
df = pd.read_csv(fn)

In [None]:
df.head()

# 2. First Tour

## 2.1 Display

In [None]:
df.head(5)

In [None]:
df.sample(20)

In [None]:
df.tail(5)

## 2.2 Structure

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.dtypes.value_counts()

In [None]:
df.nunique()

In [None]:
tmp = df.nunique()
tmp

In [None]:
tmp[tmp < 10]

In [None]:
tmp[tmp > 1000]

## 2.3 NaN and duplicates

In [None]:
df.isna().sum()

In [None]:
df.isna().mean()

In [None]:
df.isna().mean().sort_values()

In [None]:
sns.heatmap(df.isna())

In [None]:
# Wide figure so the missing-value pattern of every column stays readable;
# the heatmap is drawn on the axes created here.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 6))
sns.heatmap(df.isna(), ax=ax)

In [None]:
df.duplicated().sum()

In [None]:
df.duplicated(subset="text").sum()

In [None]:
# For each column, count the rows whose value already appeared in an earlier row.
for col in df.columns:
    dup_flags = df.duplicated(subset=[col])
    n = dup_flags.sum()
    print(f"col : {col} -> duplicated : {n}")

In [None]:
df.duplicated(subset="text",keep="first")

In [None]:
# Boolean mask of rows whose text repeats an earlier row; sorting by text puts
# identical tweets next to each other for inspection.
idx = df.duplicated(subset="text", keep="first")
df[idx].sort_values(by="text")

## 2.4 Data inspection

In [None]:
df.describe(include=np.number)

In [None]:
df.describe(include=object)

# 3. Cleaning

## 3.1 Select columns

In [None]:
tmp = df.dtypes
tmp

In [None]:
# Keep only the tweet text and its label. The explicit copy makes `df` an
# independent frame, so later modifications (dropping duplicates in place,
# adding a length column) do not raise pandas' SettingWithCopyWarning.
cols = ["text", "target"]
df = df[cols].copy()
df.head()

## 3.2 Drop NaN

In [None]:
df.isna().mean()

## 3.3 Duplicates

In [None]:
df.shape

In [None]:
# Flag every row whose text was already seen (the first occurrence is not
# flagged), then list the flagged rows ordered by text.
idx = df.duplicated(subset="text", keep="first")
df.loc[idx].sort_values(by="text")

In [None]:
# Keep the first occurrence of each tweet text and drop the later repeats, in place.
df.drop_duplicates(subset=["text"], keep="first", inplace=True)

# 4. Exploratory Data Analysis

## 4.1 Target

In [None]:
df.describe()

In [None]:
df.target.value_counts()

In [None]:
df.target.value_counts(normalize=True)

In [None]:
df.target.value_counts().plot(kind="pie")

## 4.2 Length

In [None]:
df.describe(include="object")

In [None]:
# Length of each tweet, computed by applying the built-in len to every text.
df.text.apply(len)

In [None]:
# Character count of each tweet, via the vectorised string accessor.
tmp = df.text.str.len()
# Pass the series by keyword: recent seaborn releases treat the first
# positional argument as `data`, and older ones warn on a positional vector.
sns.boxplot(x=tmp)

In [None]:
sns.displot(tmp)

In [None]:
df["_len_txt"] = tmp

## 4.3 Multivariate analysis

In [None]:
# Correlate the numeric columns only: `df` still contains the raw `text`
# column, and pandas >= 2.0 raises on non-numeric data unless
# numeric_only=True is requested (the keyword exists since pandas 1.5).
corr = df.corr(numeric_only=True)
corr

In [None]:
sns.heatmap(corr)

In [None]:
# Boolean mask that hides the upper triangle (diagonal included), so each pair
# of variables is shown only once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw on the axes created here instead of relying on the implicit current axes.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
ax = sns.heatmap(
    corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    square=True,
    ax=ax,
)

In [None]:
sns.pairplot(df)

In [None]:
sns.pairplot(df, hue="target")

In [None]:
df.sort_values("_len_txt").head(10)

In [None]:
df.sort_values("_len_txt").tail(10)

In [None]:
df.drop(columns="_len_txt", inplace=True)
df.head()

In [None]:
# Make sure the output folder exists before writing: to_csv does not create
# missing directories and would fail with an OSError otherwise.
os.makedirs("data/cleaned", exist_ok=True)
df.to_csv("data/cleaned/df_cleaned.csv", index=False)

## 4.4 Explore corpus and documents

### 4.4.1 Print N random documents

In [None]:
df_samp = df.sample(20)
df_samp

In [None]:
# Only the text is printed, so walk the text column directly.
for txt in df_samp["text"]:
    print(txt)

In [None]:
# Print each sampled tweet preceded by its label and followed by "...".
for label, txt in zip(df_samp["target"], df_samp["text"]):
    header = f"target : {label}\n"
    print(header + txt + "..." + "\n\n")

### 4.4.2 Print documents vs target

In [None]:
def print_sample(sub_df, n_samp=10):
    """Print up to ``n_samp`` randomly drawn documents from a corpus.

    Each document is printed as a ``target : <value>`` line followed by the
    text, a trailing ``...`` and blank lines.

    Parameters
    ----------
    sub_df : DataFrame holding a ``text`` and a ``target`` column.
    n_samp : maximum number of documents to print. When the corpus has fewer
        rows than this, every row is printed instead of raising.
    """

    # DataFrame.sample draws without replacement and raises ValueError when
    # asked for more rows than exist, so cap the request at the corpus size.
    n_rows = min(n_samp, len(sub_df))
    samp = sub_df.sample(n_rows)

    for target, text in zip(samp["target"], samp["text"]):
        txt_1 = f"target : {target}\n"
        print(txt_1 + text + "..." + "\n\n")

In [None]:
df_1 = df[df.target == 1]
df_1.head()

In [None]:
# print 10 docs of target == 1 

print_sample(df_1)

In [None]:
# print 10 docs of target == 0

print_sample(df[df.target==0])