* This is an Exploratory Data Analysis for the Kaggle Competition 'Feedback Prize - Evaluating Student Writing'
* Competition Website: https://www.kaggle.com/c/feedback-prize-2021/overview
* Notebook Reference: https://www.kaggle.com/erikbruin/nlp-on-student-writing-eda

# 1. Import Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob #In Python, the glob module is used to retrieve files/pathnames matching a specified pattern.
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use("fivethirtyeight")
from matplotlib.ticker import FuncFormatter
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import spacy
from sklearn.feature_extraction.text import CountVectorizer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2. Read Data

In [2]:
# Annotation table (one row per discourse element) and the sample submission
train = pd.read_csv('../input/feedback-prize-2021/train.csv')
sample_submission = pd.read_csv('../input/feedback-prize-2021/sample_submission.csv')

# Cast the id and the two offset columns to int
int_cols = ['discourse_id','discourse_start','discourse_end']
train[int_cols] = train[int_cols].astype(int)

# glob expands the shell-style pattern into the list of matching essay file paths
train_text = glob('../input/feedback-prize-2021/train/*.txt')
test_text = glob('../input/feedback-prize-2021/test/*.txt')

In [3]:
# Shell escape: print the raw text of one training essay
! cat ../input/feedback-prize-2021/train/0000D23A521A.txt

In [4]:
# Number of essay files matched by the glob, then the number of distinct essay ids in train
print(len(train_text))
train['id'].nunique()

In [5]:
# Prints the essay file count again, then shows every annotation row of one essay
print(len(train_text))
train.query('id == "0000D23A521A"')

# Length of discourse_text and predictionstring

* The number of words in discourse_text and the number of tokens in predictionstring should be the same.
* However, for quite a few rows they differ by 1; this is because of how the tokens are calculated.

In [6]:
def _n_tokens(text):
    """Number of whitespace-separated tokens in text."""
    return len(text.split())


# Word count of the annotated text and token count of the prediction string
train['discourse_len'] = train['discourse_text'].apply(_n_tokens)
train["pred_len"] = train["predictionstring"].apply(_n_tokens)

cols_to_display = [
    'discourse_id',
    'discourse_text',
    'discourse_type',
    'predictionstring',
    'discourse_len',
    'pred_len',
]
train[cols_to_display].head()

In [7]:
# Rows whose discourse_text word count differs from the predictionstring token count
train.query('discourse_len != pred_len')

# 3. Frequency of discourse type and average length of the text

In [8]:
# Two stacked panels: number of rows per discourse type (top) and
# mean discourse_len per discourse type (bottom).
fig = plt.figure(figsize = (12,8))

ax1 = fig.add_subplot(211)
# Pass the target axes explicitly instead of relying on pyplot's "current axes" state
train.groupby('discourse_type')['discourse_type'].count().sort_values().plot(kind = 'barh', ax = ax1)
ax1.get_xaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x),','))) #add thousands separator
ax1.set_title("Frequency of Discourse Type in all Essays")
ax1.set_xlabel("Frequency", fontsize = 10)
ax1.set_ylabel("")

ax2 = fig.add_subplot(212)
train.groupby('discourse_type')['discourse_len'].mean().sort_values().plot(kind = 'barh', ax = ax2)
ax2.set_title("Average number of words versus discourse type")
ax2.set_xlabel("Average number of words", fontsize = 10)
ax2.set_ylabel("")

plt.tight_layout(pad=2)
plt.show()

# Percent of the discourse type present in essays

In [9]:
fig = plt.figure(figsize = (12,8))

# Row count per discourse_type_num value (smallest first), turned into a
# two-column frame: 'discourse_type_num' and 'count'
type_num_counts = train['discourse_type_num'].value_counts(ascending = True)
ave_per_essay = type_num_counts.rename_axis('discourse_type_num').reset_index(name='count')

# Count divided by the number of distinct essays, rounded to 3 decimals
n_distinct_essays = train['id'].nunique()
ave_per_essay['perc'] = (ave_per_essay['count'] / n_distinct_essays).round(3)
ave_per_essay = ave_per_essay.set_index('discourse_type_num')

# Only values above the 0.03 threshold are plotted
ax = ave_per_essay.query('perc > 0.03')['perc'].plot(kind='barh')
ax.set_title("discourse_type_num: Percent present in essays", fontsize=20, fontweight = 'bold')
ax.bar_label(ax.containers[0], label_type='edge') #label column value
ax.set_xlabel('Percent')
ax.set_ylabel('')
plt.show()


# Average positions of the discourse start and end

In [10]:
# Mean discourse_end / discourse_start per discourse type, ordered by mean start (descending)
mean_positions = train.groupby('discourse_type')[['discourse_end','discourse_start']].mean()
data = mean_positions.reset_index().sort_values(by = 'discourse_start', ascending = False)

# Grouped (non-stacked) horizontal bars drawn with the pandas plotting wrapper
ax = data.plot(
    x = 'discourse_type',
    kind = 'barh',
    stacked = False,
    title = 'Average start and end position absolute',
    figsize = (12,4),
)

# Write the value next to every bar of both series
for bars in ax.containers:
    ax.bar_label(bars, fontsize=10)
plt.show()

# 4. Investigating the gaps between annotations (text not used as discourse_text)

* There are quite a few chunks of text that are not classified

In [11]:
# essay id -> character length (after strip) and essay id -> whitespace-token count
len_dict = {}
word_dict = {}
for essay_path in tqdm(train_text):
    # File name without directory and without the ".txt" suffix is the essay id
    essay_id = essay_path.split("/")[-1].replace(".txt","")
    with open(essay_path, "r") as txt_file:
        essay_body = txt_file.read()
    len_dict[essay_id] = len(essay_body.strip())
    word_dict[essay_id] = len(essay_body.split())

# Attach both measures to every annotation row of the matching essay
train['essay_len'] = train['id'].map(len_dict)
train['essay_words'] = train['id'].map(word_dict)
    

* Calculate the gaps

In [12]:
# Length of the unannotated text in front of every annotation.
# Computed with shifted columns instead of a row-by-row loop; this also removes
# the hard-coded value for the first row, which now follows the same rule as the
# first annotation of any other essay.
# Like the comparison with "the previous row" requires, rows are assumed to be
# grouped by essay and ordered by position within each essay.
prev_id = train['id'].shift()
prev_end = train['discourse_end'].shift()
starts_new_essay = train['id'].ne(prev_id)  # True for the very first row as well
offset_from_prev = train['discourse_start'] - prev_end

#initialize the column
train['gap_length'] = np.nan

#gap inside an essay: the annotation starts more than 1 position after the previous end
inner_gap = ~starts_new_essay & (offset_from_prev > 1)
train.loc[inner_gap, 'gap_length'] = offset_from_prev[inner_gap] - 2

#gap at the start of an essay: the first annotation does not start at position 0
leading_gap = starts_new_essay & train['discourse_start'].ne(0)
train.loc[leading_gap, 'gap_length'] = train.loc[leading_gap, 'discourse_start'] - 1
    
    
        


* text after the last discourse

In [13]:
# Last annotation row of every essay. The explicit copy makes last_ones an
# independent frame, so adding a column below is not a write to a slice of train.
last_ones = train.drop_duplicates(subset = 'id', keep = 'last').copy()

#np.where(condition, x, y), choose x if condition else y
# Text remains after the last annotation when it ends before the essay length
last_ones['gap_end_length'] = np.where((last_ones.discourse_end < last_ones.essay_len),\
                                       (last_ones.essay_len - last_ones.discourse_end),\
                                       np.nan)

cols_to_merge = ['id','discourse_id','gap_end_length']
# Drop a gap_end_length column left over from an earlier run of this cell, so that
# re-running it does not produce gap_end_length_x / gap_end_length_y columns.
train = train.drop(columns = 'gap_end_length', errors = 'ignore')
train = train.merge(last_ones[cols_to_merge], on=['id','discourse_id'], how = 'left')

In [14]:
# Position and gap columns used for inspection here and in the following cells; shown for one essay
col_to_show = ['id','discourse_start', 'discourse_end','discourse_type','essay_len','gap_length', 'gap_end_length']
train[col_to_show].query('id == "4C471936CD75"')

In [15]:
# Every non-null gap_length or gap_end_length entry counts as one unclassified piece of text
n_pieces_not_classified = train['gap_length'].notna().sum() + train['gap_end_length'].notna().sum()
print(f"Besides the {len(train)} discourse texts, there are {n_pieces_not_classified} pieces of text not classified. ")

In [16]:
# huge gaps
# Rows ordered by gap_length, largest first
train.sort_values(by = 'gap_length', ascending = False)[col_to_show]

In [17]:
# Rows ordered by gap_end_length (text after the last annotation), largest first
train.sort_values(by='gap_end_length', ascending = False)[col_to_show]

* Below, you can see a histogram of the length of all gaps with the outliers taken out (all gaps longer than 300 characters).

In [18]:
# All non-null gap lengths (before annotations and after the last annotation) in one Series.
# pd.concat replaces Series.append, which no longer exists in pandas 2.x.
all_gaps = pd.concat([train.gap_length.dropna(), train.gap_end_length.dropna()], ignore_index = True)
#filter out outliers
all_gaps = all_gaps[all_gaps<300]

fig = plt.figure(figsize = (12,6))
all_gaps.plot(kind = 'hist', bins=100)
plt.title("Histogram of gap length (gaps up to 300 characters only)")
plt.xticks(rotation=0)
plt.xlabel("Length of gaps in characters")
plt.show()

# Essays with large percentage of text not classified

* Essays in which more than 80 percent of the text is not classified

In [67]:
# Per essay: its length (taken from the first row) and the summed gap lengths
total_gaps = train.groupby('id').agg(
    {
        'essay_len': 'first',
        'gap_length': 'sum',
        'gap_end_length': 'sum',
    }
)

# Unclassified characters divided by essay length, rounded to 2 decimals
chars_not_classified = total_gaps.gap_length + total_gaps.gap_end_length
total_gaps['perc_not_classified'] = (chars_not_classified / total_gaps.essay_len).round(2)

# Largest values first, restricted to values above 0.8
ranked_gaps = total_gaps.sort_values('perc_not_classified', ascending = False)
ranked_gaps.query("perc_not_classified > 0.8")

# 5. Color printing essays including the gaps

In [20]:
def add_gap_rows(essay, df=None):
    """Return one essay's annotation rows plus "Nothing" rows for unannotated text.

    A "Nothing" row is added in front of every annotation whose ``gap_length``
    is positive (including the first annotation of the essay) and after the
    last annotation when its ``gap_end_length`` is positive.

    Parameters
    ----------
    essay : str
        Essay id, matched against the ``id`` column.
    df : pandas.DataFrame, optional
        Frame to read from. It needs an ``id`` column and the columns
        ``discourse_start``, ``discourse_end``, ``discourse_type``,
        ``gap_length`` and ``gap_end_length``. When omitted, the notebook-level
        ``train`` frame is used.

    Returns
    -------
    pandas.DataFrame
        The selected rows and the added gap rows, sorted by ``discourse_start``;
        a trailing gap row, if any, is appended after sorting. The frame is
        empty when no row matches ``essay``.
    """
    source = train if df is None else df
    cols_to_keep = ['discourse_start', 'discourse_end', 'discourse_type', 'gap_length', 'gap_end_length']
    df_essay = source.query('id == @essay')[cols_to_keep].reset_index(drop = True)

    # Nothing to annotate for an unknown id; the positional lookups below need at least one row
    if df_essay.empty:
        return df_essay

    #index new row
    insert_row = len(df_essay)

    # Start at 0 so that a gap before the first annotation gets its own row too
    for i in range(len(df_essay)):
        if df_essay.loc[i, "gap_length"] > 0:
            # The first row has no predecessor: its gap starts at the beginning of the essay
            start = 0 if i == 0 else df_essay.loc[i-1, "discourse_end"] + 1
            end = df_essay.loc[i, 'discourse_start'] - 1
            df_essay.loc[insert_row] = [start, end, "Nothing", np.nan, np.nan]
            insert_row += 1

    df_essay = df_essay.sort_values(by = "discourse_start").reset_index(drop=True)

    #add gap at end
    last_row = len(df_essay) - 1
    if df_essay.loc[last_row, 'gap_end_length'] > 0:
        start = df_essay.loc[last_row, "discourse_end"] + 1
        end = start + df_essay.loc[last_row, 'gap_end_length']
        df_essay.loc[insert_row] = [start, end, "Nothing", np.nan, np.nan]

    return df_essay

# Example: annotation rows plus inserted gap rows for one essay
add_gap_rows("129497C3E0FC")

In [21]:
def print_colored_essay(essay):
    """Render one training essay with its discourse elements highlighted.

    The rows returned by ``add_gap_rows(essay)`` are turned into manual
    displaCy entities (start, end, label) and rendered over the essay text
    read from the training folder. The colour map below covers the seven
    discourse types; no colour is specified here for the "Nothing" label.

    Parameters
    ----------
    essay : str
        Essay id; also the name of the ``.txt`` file without its extension.
    """
    #code from https://www.kaggle.com/odins0n/feedback-prize-eda, but adjusted to df_essay
    segments = add_gap_rows(essay)

    # One entity dict per row
    ents = [
        {
            'start': int(segment['discourse_start']),
            'end': int(segment['discourse_end']),
            'label': segment['discourse_type'],
        }
        for _, segment in segments.iterrows()
    ]

    essay_file = "../input/feedback-prize-2021/train/" + essay + ".txt"
    with open(essay_file, 'r') as handle:
        essay_text = handle.read()

    colors = {
        'Lead': '#EE11D0',
        'Position': '#AB4DE1',
        'Claim': '#1EDE71',
        'Evidence': '#33FAFA',
        'Counterclaim': '#4253C1',
        'Concluding Statement': 'yellow',
        'Rebuttal': 'red',
    }
    options = {"ents": segments.discourse_type.unique().tolist(), "colors": colors}

    # manual=True: the dict already holds text and entity spans, so no spaCy pipeline is run
    spacy.displacy.render(
        {"text": essay_text, "ents": ents},
        style="ent",
        options=options,
        manual=True,
        jupyter=True,
    )

In [22]:
# Example: highlighted rendering of one essay
print_colored_essay("7330313ED3F0")

# 6. Most Used Words per Discourse Type

In [23]:
# Lower-case the annotated text (overwrites the discourse_text column)
train['discourse_text'] = train['discourse_text'].str.lower()

#get stopwords from nltk library
other_words_to_take_out = ['school', 'students', 'people', 'would', 'could', 'many']
stop_english = stopwords.words("english")
stop_english.extend(other_words_to_take_out)


#put dataframe of Top-10 words in dict for all discourse types
counts_dict = {}

for dt in tqdm(train['discourse_type'].unique()):
    df = train.query("discourse_type == @dt") #@dt refers to the loop variable
    # Split every text on whitespace and flatten into one list of words
    text = [word for passage in df.discourse_text for word in passage.split()]
    df1 = pd.Series(text).value_counts().to_frame().reset_index()
    df1.columns = ['Word', 'Frequency']
    # Remove stop words, then keep the 10 most frequent remaining words
    df1 = df1[~df1.Word.isin(stop_english)].head(10)
    # Ascending order so the most frequent word ends up at the top of a barh plot
    df1 = df1.set_index("Word").sort_values(by = "Frequency", ascending = True)
    counts_dict[dt] = df1

# One panel per discourse type on a 4x2 grid
plt.figure(figsize=(15,12))
plt.subplots_adjust(hspace=0.5)
keys = list(counts_dict)

for n, key in enumerate(keys):
    ax = plt.subplot(4, 2, n + 1)
    ax.set_title(f"Most used words in {key}")
    counts_dict[key].plot(ax=ax, kind = 'barh')
    ax.set_ylabel("")
plt.show()

# Making n_grams for each discourse type

In [54]:
def get_n_grams(n_grams,top_n = 10):
    """Return the most frequent n-grams for every discourse type in ``train``.

    Parameters
    ----------
    n_grams : int
        Size of the n-grams; used as both bounds of ``ngram_range``.
    top_n : int, default 10
        Number of most frequent n-grams kept per discourse type.

    Returns
    -------
    pandas.DataFrame
        Columns ``Discourse_type``, ``words`` and ``counts``; per discourse type
        the rows are ordered by ``counts`` descending. Empty when ``train`` has
        no discourse types.
    """
    # Collect the per-type frames and concatenate once at the end.
    # DataFrame.append no longer exists in pandas 2.x and copied the growing frame on every call.
    top_frames = []
    for dt in tqdm(train['discourse_type'].unique()):
        df = train.query('discourse_type == @dt') #rows of the current discourse type, e.g. Lead
        texts = df['discourse_text'].tolist() #list of text strings, one per row
        vec = CountVectorizer(lowercase = True, stop_words='english', ngram_range = (n_grams, n_grams)).fit(texts) #learn the n-gram vocabulary
        bag_of_words = vec.transform(texts) #document-term matrix: one row per text, one column per n-gram
        sum_words = bag_of_words.sum(axis = 0) #total count of every n-gram over all texts
        words_freq = [(word, sum_words[0,idx]) for word, idx in vec.vocabulary_.items()]
        cvec_df = pd.DataFrame.from_records(words_freq, columns = ['words','counts']).sort_values(by="counts", ascending = False)
        cvec_df.insert(0, "Discourse_type", dt) #first column identifies the discourse type
        top_frames.append(cvec_df.iloc[:top_n,:]) #keep the top_n n-grams
    if not top_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(top_frames)

# Bigrams

In [55]:
# Top 10 two-word sequences per discourse type
bigrams = get_n_grams(2, 10)
bigrams.head()

In [62]:
def plot_ngram(df, type = 'bigram'):
    """Draw one horizontal bar chart of n-gram counts per discourse type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Discourse_type``, ``words`` and ``counts``.
    type : str, default 'bigram'
        Label inserted into every panel title.
    """
    fig = plt.figure(figsize = (15,12))
    fig.subplots_adjust(hspace = 0.5)

    # Panels are placed on a fixed 4x2 grid, in order of first appearance of each type
    for position, dt in enumerate(df.Discourse_type.unique(), start = 1):
        ax = fig.add_subplot(4, 2, position)
        ax.set_title(f"Most used words {type} in {dt}")
        # 'words' becomes the index, which pandas uses for the bar labels;
        # ascending order puts the largest count at the top of the panel
        counts_for_type = df.query('Discourse_type == @dt')[['words','counts']]
        counts_for_type = counts_for_type.set_index('words').sort_values(by = 'counts', ascending = True)
        counts_for_type.plot(ax=ax, kind = 'barh')
        ax.set_ylabel("") #hide the index name on the y axis
    fig.tight_layout() #Adjust the padding between and around subplots.
    plt.show()
    
# Plot the bigram counts with the default title label
plot_ngram(bigrams) 

# Trigrams

In [63]:
# Top 10 three-word sequences per discourse type, plotted with a matching title label
trigrams = get_n_grams(n_grams = 3, top_n=10)
plot_ngram(trigrams, type = "trigrams")