## Cleaning

In [37]:
import pandas as pd

In [38]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

In [39]:
raw = pd.read_csv('../data/raw/ia_human_texts.csv')
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1367 entries, 0 to 1366
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text_content          1367 non-null   object 
 1   content_type          1367 non-null   object 
 2   word_count            1367 non-null   int64  
 3   character_count       1367 non-null   int64  
 4   sentence_count        1367 non-null   int64  
 5   lexical_diversity     1367 non-null   float64
 6   avg_sentence_length   1367 non-null   float64
 7   avg_word_length       1367 non-null   float64
 8   punctuation_ratio     1367 non-null   float64
 9   flesch_reading_ease   1288 non-null   float64
 10  gunning_fog_index     1332 non-null   float64
 11  grammar_errors        1367 non-null   int64  
 12  passive_voice_ratio   1336 non-null   float64
 13  predictability_score  1367 non-null   float64
 14  burstiness            1367 non-null   float64
 15  sentiment_score      

In [40]:
interim = raw.dropna()
interim.to_csv('../data/interim/ia_human_texts_int.csv', index=False)
interim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1179 entries, 0 to 1366
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text_content          1179 non-null   object 
 1   content_type          1179 non-null   object 
 2   word_count            1179 non-null   int64  
 3   character_count       1179 non-null   int64  
 4   sentence_count        1179 non-null   int64  
 5   lexical_diversity     1179 non-null   float64
 6   avg_sentence_length   1179 non-null   float64
 7   avg_word_length       1179 non-null   float64
 8   punctuation_ratio     1179 non-null   float64
 9   flesch_reading_ease   1179 non-null   float64
 10  gunning_fog_index     1179 non-null   float64
 11  grammar_errors        1179 non-null   int64  
 12  passive_voice_ratio   1179 non-null   float64
 13  predictability_score  1179 non-null   float64
 14  burstiness            1179 non-null   float64
 15  sentiment_score       1179

In [41]:
interim.head()

Unnamed: 0,text_content,content_type,word_count,character_count,sentence_count,lexical_diversity,avg_sentence_length,avg_word_length,punctuation_ratio,flesch_reading_ease,gunning_fog_index,grammar_errors,passive_voice_ratio,predictability_score,burstiness,sentiment_score,label
0,Score each cause. Quality throughout beautiful...,academic_paper,288,1927,54,0.9514,5.33,5.69,0.028,53.08,7.41,1,0.1041,105.86,0.5531,0.2034,1
1,Board its rock. Job worker break tonight coupl...,essay,253,1719,45,0.9723,5.62,5.8,0.0262,50.32,8.1,6,0.2045,100.29,0.5643,0.4854,1
2,Way debate decision produce. Dream necessary c...,academic_paper,420,2849,75,0.9071,5.6,5.79,0.0263,46.86,7.86,5,0.2308,96.88,0.4979,-0.2369,1
5,Spend value return couple. Marriage method mat...,blog_post,198,1383,37,0.9596,5.35,5.99,0.0268,43.31,6.99,0,0.0871,36.96,0.2328,-0.2755,1
6,Land region back nor article natural measure. ...,blog_post,84,551,15,0.9762,5.6,5.57,0.0272,61.16,6.53,2,0.0988,53.49,0.558,0.9505,1


## Enconding

In [42]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
interim.loc[:, 'content_type_cd'] = le.fit_transform(interim['content_type'])
content_type_codes = dict(zip(le.classes_, le.transform(le.classes_)))
content_type_codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interim.loc[:, 'content_type_cd'] = le.fit_transform(interim['content_type'])


{'academic_paper': np.int64(0),
 'article': np.int64(1),
 'blog_post': np.int64(2),
 'creative_writing': np.int64(3),
 'essay': np.int64(4),
 'news_article': np.int64(5),
 'product_review': np.int64(6),
 'social_media': np.int64(7)}

## Normalizing

In [51]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

interim_norm = interim.drop(columns=['text_content', 'content_type'])
interim_norm = interim_norm.astype('float64')

interim_norm.loc[:, interim_norm.columns] = scaler.fit_transform(interim_norm)
interim_norm = pd.concat([interim_norm, interim['text_content']], axis=1)
interim_norm.head()


Unnamed: 0,word_count,character_count,sentence_count,lexical_diversity,avg_sentence_length,avg_word_length,punctuation_ratio,flesch_reading_ease,gunning_fog_index,grammar_errors,passive_voice_ratio,predictability_score,burstiness,sentiment_score,label,content_type_cd,text_content
0,0.647727,0.648035,0.646341,0.6112,0.466,0.3903,0.165385,0.692437,0.232846,0.1,0.2705,0.859159,0.64781,0.602797,1.0,0.0,Score each cause. Quality throughout beautiful...
1,0.568182,0.577575,0.536585,0.7784,0.524,0.415704,0.130769,0.673898,0.258718,0.6,0.7725,0.803403,0.663891,0.744136,1.0,0.571429,Board its rock. Job worker break tonight coupl...
2,0.947727,0.960366,0.902439,0.2568,0.52,0.413395,0.132692,0.650658,0.249719,0.5,0.904,0.769269,0.568557,0.382117,1.0,0.0,Way debate decision produce. Dream necessary c...
5,0.443182,0.463753,0.439024,0.6768,0.47,0.459584,0.142308,0.626814,0.217098,0.0,0.1855,0.169469,0.18794,0.362771,1.0,0.285714,Spend value return couple. Marriage method mat...
6,0.184091,0.181911,0.170732,0.8096,0.52,0.362587,0.15,0.746709,0.19985,0.2,0.244,0.334935,0.654846,0.977245,1.0,0.285714,Land region back nor article natural measure. ...


In [52]:
interim_norm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1179 entries, 0 to 1366
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   word_count            1179 non-null   float64
 1   character_count       1179 non-null   float64
 2   sentence_count        1179 non-null   float64
 3   lexical_diversity     1179 non-null   float64
 4   avg_sentence_length   1179 non-null   float64
 5   avg_word_length       1179 non-null   float64
 6   punctuation_ratio     1179 non-null   float64
 7   flesch_reading_ease   1179 non-null   float64
 8   gunning_fog_index     1179 non-null   float64
 9   grammar_errors        1179 non-null   float64
 10  passive_voice_ratio   1179 non-null   float64
 11  predictability_score  1179 non-null   float64
 12  burstiness            1179 non-null   float64
 13  sentiment_score       1179 non-null   float64
 14  label                 1179 non-null   float64
 15  content_type_cd       1179

In [53]:
interim_norm.to_csv('../data/interim/ia_human_texts_int.csv', index=False)