### Set-up

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
import kagglehub

# Download the Kaggle dataset and keep the location kagglehub returns in `path`.
dataset_handle = 'abhi8923shriv/sentiment-analysis-dataset'
path = kagglehub.dataset_download(dataset_handle)

### Loading dataset

In [4]:
# Locations of the train/test CSV files inside the downloaded dataset directory.
# os.path.join uses the platform's separator and avoids doubled separators,
# instead of concatenating a hard-coded '/' onto `path`.
train_dataset = os.path.join(path, 'train.csv')
test_dataset = os.path.join(path, 'test.csv')

In [5]:
# Read the train and test CSV files (in that order) with the same ISO-8859-1 encoding.
csv_encoding = 'ISO-8859-1'
train_df, test_df = (
    pd.read_csv(csv_file, encoding=csv_encoding)
    for csv_file in (train_dataset, test_dataset)
)

In [9]:
# Preview the training frame; the rendering is truncated to the first and last rows.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


### Profiling with ydata-profiling

In [6]:
from ydata_profiling import ProfileReport

# Create the profiling report object for the raw training frame.
report_title = "Profiling Report"
profile = ProfileReport(train_df, title=report_title)

In [None]:
# Write the profiling report to an HTML file in the working directory.
report_html_file = "report.html"
profile.to_file(report_html_file)



Summarize dataset: 100%|██████████| 29/29 [00:01<00:00, 21.32it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 284.65it/s]


In [None]:
# Display the HTML report in the notebook by leaving `profile` as the cell's last expression.
profile

Summarize dataset: 100%|██████████| 29/29 [00:02<00:00, 14.36it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.51it/s]




### Find rows with missing values

In [7]:
# Summarize dtypes and non-null counts per column; a non-null count below the
# total number of entries means that column contains missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [None]:
# According to the ydata profile there should be only one row with missing values.
# Flag every row that has at least one null and show the matching rows.
rows_with_nulls = train_df.isna().any(axis=1)
train_df[rows_with_nulls]

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
314,fdb77c3752,,,neutral,night,31-45,Namibia,2540905,823000.0,3


### Remove rows with missing values

In [9]:
# Drop every row that contains a missing value instead of hard-coding the index
# label (314), so this cell keeps working if the dataset changes or is reloaded.
# The original index labels are kept (no reset), as with the previous drop().
nonmissing_df = train_df.dropna(axis=0, how='any')

# Guard: the cleaned frame must be free of nulls before it is profiled again.
assert not nonmissing_df.isna().any().any(), "missing values remain after dropna"

nonmissing_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27480 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27480 non-null  object 
 4   Time of Tweet     27480 non-null  object 
 5   Age of User       27480 non-null  object 
 6   Country           27480 non-null  object 
 7   Population -2020  27480 non-null  int64  
 8   Land Area (Km²)   27480 non-null  float64
 9   Density (P/Km²)   27480 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.3+ MB


In [10]:
# Build the profiling report again, this time from the frame without missing values.
report_options = {"title": "Profiling Report"}
new_profile = ProfileReport(nonmissing_df, **report_options)

In [11]:
# Write the regenerated report to an HTML file, then display it in the notebook
# by ending the cell with the report object.
new_report_file = "new_report.html"
new_profile.to_file(new_report_file)

new_profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

