# Imports

In [4]:
import numpy as np
import os
import pandas as pd


# Load Data

In [5]:
# Both competition CSVs live in the same Kaggle input directory.
DATA_DIR = '../input/feedback-prize-english-language-learning'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# -- STEFANOS -- Replicate Data

In [6]:
def _scale_frame(frame, factor):
    """Return ``frame`` grown or shrunk by ``factor``.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to scale.
    factor : float
        Target size as a multiple of the current size. When ``int(factor)``
        is at least 1 the frame is stacked ``int(factor)`` times with a fresh
        0..n-1 index; otherwise only the first ``int(len(frame) * factor)``
        rows are kept.

    Returns
    -------
    pd.DataFrame
        A new frame that does not alias ``frame``.
    """
    copies = int(factor)
    if copies > 0:
        return pd.concat([frame] * copies, ignore_index=True)
    row_count = int(frame.shape[0] * factor)
    # Copy the slice: later cells add columns to the result, and assigning
    # into a bare slice raises SettingWithCopyWarning on classic pandas.
    return frame.iloc[:row_count].copy()


# Target combined (shallow) in-memory size of train + test, in MiB.
intended_df_size_in_MB = 256

# One common factor for both frames, computed from their sizes *before*
# either one is scaled.
factor = intended_df_size_in_MB * (2**20) / (
    train_df.memory_usage(index=True).sum()
    + test_df.memory_usage(index=True).sum()
)

# NOTE: this cell rebinds train_df/test_df from themselves, so running it a
# second time without reloading the data scales the already-scaled frames.
train_df = _scale_frame(train_df, factor)
test_df = _scale_frame(test_df, factor)

train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4188681 entries, 0 to 4188680
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   text_id      object 
 1   full_text    object 
 2   cohesion     float64
 3   syntax       float64
 4   vocabulary   float64
 5   phraseology  float64
 6   grammar      float64
 7   conventions  float64
dtypes: float64(6), object(2)
memory usage: 255.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3213 entries, 0 to 3212
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_id    3213 non-null   object
 1   full_text  3213 non-null   object
dtypes: object(2)
memory usage: 50.3+ KB


Let's look at the first few rows of each dataset.

In [4]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [5]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...


Next, the number of rows in each dataset.

In [6]:
# (train rows, test rows)
(train_df.shape[0], test_df.shape[0])

(1955500, 1500)

In [7]:
# Names of the score columns in the training frame.
LABEL_COLUMNS = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

# Text Examples

## Random Examples

In [8]:
# The upstream version of this cell used an integer fraction:
#   texts = train_df.sample(frac=1, random_state=420).head(4)
# It is written with frac=1.0 here so the cell also runs with Koalas.
texts = (
    train_df
    .sample(frac=1.0, random_state=420)
    .head(4)
)

## Lowest Scoring Examples

In [9]:
# Per-essay total: row-wise sum over the LABEL_COLUMNS score columns.
train_df['total_score'] = train_df.loc[:, LABEL_COLUMNS].sum(axis='columns')

# Ascending sort puts the smallest totals first; keep the first four rows.
lowest_df = train_df.sort_values(by='total_score', ascending=True).head(4)

## Highest Scoring Examples

In [10]:
# Recompute the per-essay total so this cell does not rely on an earlier
# cell having created the column.
train_df['total_score'] = train_df.loc[:, LABEL_COLUMNS].sum(axis='columns')

# Descending sort puts the largest totals first; keep the first four rows.
highest_df = train_df.sort_values(by='total_score', ascending=False).head(4)

# Text Overview

## Word Count

In [11]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


train_df['word_count'] = train_df['full_text'].apply(_count_words)

Mean word count:

In [12]:
# Average of the word_count column.
train_df.word_count.mean()

430.4929685502429

Max word count:

In [13]:
# Largest value in the word_count column.
train_df.word_count.max()

1260