# Data Extraction

In [None]:
!pip install -U boto3 awswrangler

In [None]:
!pip install snowflake-connector-python

In [None]:
import pandas as pd
import re, os
import boto3
import nltk
from sagemaker import get_execution_role
import awswrangler as wr
from collections import Counter 
import warnings

In [None]:
from db_utils import *

In [None]:
# Silence every warning for the rest of the session (no message, category or module filter).
warnings.simplefilter("ignore")

In [None]:
# SQL text handed to pd.read_sql_query in the next cell.
# NOTE(review): the statement stops right after FROM with no table name — complete it before running.
query = "select * from"

In [None]:
# Open a Snowflake connection for the configured warehouse/database/schema.
# get_snowflake_connection is not defined in this notebook (presumably it comes
# from the `from db_utils import *` cell).
snowflake_connection = get_snowflake_connection(
    secret_name='scripts/snowflake',
    warehouse='WAREHOUSE_NAME',
    database='DATABASE_NAME',
    schema='SCHEMA_NAME',
)

# Run the query into a DataFrame; the finally clause closes the connection
# even when the query raises, so a failed read does not leak the session.
try:
    corpus = pd.read_sql_query(query, snowflake_connection)
finally:
    snowflake_connection.close()

In [None]:
# Show the (rows, columns) of the query result, then its first five rows.
corpus_dims = corpus.shape
print(corpus_dims)
corpus.head(5)

In [None]:
# Persist the raw query result to S3.
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an unnamed first column and comes back as a spurious
# "Unnamed: 0" column when the CSV is re-read with pd.read_csv later on.
wr.s3.to_csv(
    df=corpus,
    path='s3://datasetS/text_summarization/corpus.csv',
    index=False
)

___
# Data Cleaning
## Import data

In [None]:
# SageMaker execution role of this session (not referenced again in the visible cells).
role = get_execution_role()

# Read the corpus CSV back from S3; read_csv already returns a DataFrame.
data_location = 's3://datasetS/text_summarization/corpus.csv'
df = pd.read_csv(data_location)

In [None]:
# Show the (rows, columns) of the loaded corpus, then its first five rows.
df_dims = df.shape
print(df_dims)
df.head(5)

## Data Pre-Processing and Cleaning

<mark>Remove all `NaN`</mark>

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value.
# The surviving rows keep their original index labels (no renumbering here).
df = df.dropna(axis=0, how='any')

<mark>Remove `\n`</mark>

In [None]:
# TEXT of the row at position 100, before newlines are replaced.
df['TEXT'].iloc[100]

In [None]:
# Replace each newline character with a single space across the DataFrame.
df = df.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# Same row again, to check the newlines are gone.
df['TEXT'].iloc[100]

<mark>Remove Multiple Blanks in DataFrame</mark>

In [None]:
# Collapse every run of whitespace into a single space across the DataFrame.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df = df.replace(r'\s+', ' ', regex=True)

In [None]:
# Same row once more, now with whitespace runs collapsed.
df['TEXT'].iloc[100]

In [None]:
# Upload the cleaned corpus to S3.
cleaned_corpus_path = 's3://datasetS/text_summarization/corpus(cleaned).csv'
wr.s3.to_csv(df=df, path=cleaned_corpus_path)

----

# Data Analysis

In [None]:
# (rows, columns) of the cleaned corpus.
(len(df.index), len(df.columns))

Cleaned text example

In [None]:
# TEXT of the row at position 4 as an example of a cleaned document.
df['TEXT'].iloc[4]

## Counting the number of words in the chosen input text

In [None]:
# Pick the document at position 1 as the sample input.
text = df['TEXT'].iloc[1]

In [None]:
# Count the space-separated tokens of the selected text and report the total.
res = len(text.split(' '))
print(f"The number of words in string are : {res}")


## Counting the number of words in each text in the dataset

In [None]:
# Word count per document, computed column-wise.
#
# The earlier loop read texts by position (`.values[i]`) but wrote results by
# index label (`df.at[i, ...]`). After `dropna()` the labels have gaps, so
# counts were stored on the wrong rows and missing labels were appended as new,
# empty rows. It also assumed exactly 1180 rows. Working on whole columns keeps
# every count on its own row for any number of rows.
words_per_text = df['TEXT'].str.split(' ').str.len()

# Stored as floats, matching the dtype the loop-based version produced.
df['NUM_WORDS'] = words_per_text.astype(float)

# Bucket to the nearest hundred (ties round to the even hundred, as with
# Python's built-in round()).
df['NUM_WORDS_ROUNDED'] = (words_per_text / 100).round() * 100

## Distribution of text lengths: number of texts per rounded word count, ordered from the largest word count to the smallest

In [None]:
# Number of texts per rounded word count, with bars ordered from the largest
# word-count bucket down to the smallest.
bucket_counts = df['NUM_WORDS_ROUNDED'].value_counts()
bucket_counts = bucket_counts.sort_values(ascending=True)
bucket_counts.sort_index(ascending=False).plot.bar(figsize=(16, 8))

## Distribution of text lengths: number of texts per rounded word count, ordered from the most frequent word count to the least frequent

In [None]:
# Number of texts per rounded word count, most frequent bucket first
# (value_counts orders by descending count).
frequency_by_bucket = df['NUM_WORDS_ROUNDED'].value_counts()
frequency_by_bucket.plot.bar(figsize=(16, 8))

Some input examples don't contain a full text, just the **Title** or **part of the text**

In [None]:
# Rows whose word count rounds to 0.
is_zero_bucket = df['NUM_WORDS_ROUNDED'].eq(0.0)
df_0 = df[is_zero_bucket]
df_0

In [None]:
# Example text from the 0-word bucket (row at position 3).
df_0['TEXT'].iloc[3]

In [None]:
# Rows whose word count rounds to 2000.
is_2000_bucket = df['NUM_WORDS_ROUNDED'].eq(2000.0)
df_2000 = df[is_2000_bucket]
df_2000

In [None]:
# Example text from the 2000-word bucket (row at position 2).
df_2000['TEXT'].iloc[2]

## Counting how many input texts have more than N words and labelling them as `TOO BIG` for our chosen model

In [None]:
# Label every document by its rounded word count: at most 2000 -> "OPTIMAL",
# anything else -> "TOO BIG".
#
# Done column-wise instead of looping over range(1180): the loop read values by
# position but wrote labels by index label via `df.at[i, ...]`, which puts
# labels on the wrong rows (or appends new rows) whenever the index is not
# exactly 0..n-1, and it silently assumed the frame has 1180 rows.
#
# The column name "LENGHT" (sic) is kept as-is because later cells and the
# saved CSV use that spelling.
MAX_ROUNDED_WORDS = 2000.0
within_limit = df['NUM_WORDS_ROUNDED'].le(MAX_ROUNDED_WORDS)
df['LENGHT'] = within_limit.map({True: "OPTIMAL", False: "TOO BIG"})
        

In [None]:
# How many documents received each length label.
length_labels = df['LENGHT']
length_labels.value_counts()

In [None]:
# Bar chart of the number of documents per length label.
label_counts = df['LENGHT'].value_counts()
label_counts.plot.bar(figsize=(16, 8))

In [None]:
# Upload the corpus, including the word-count and length-label columns, to S3.
analyzed_corpus_path = 's3://datasetS/text_summarization/corpus_analyzed.csv'
wr.s3.to_csv(df=df, path=analyzed_corpus_path)