In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [2]:
# Load the training essays into `df` and preview the first rows.
train_csv_path = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
# Download necessary NLTK data: the 'stopwords' and 'punkt' resources, in that order
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)


def _english_stop_words():
    """Return the English stop-word set, building it only on the first call.

    The set is memoised on the function object so that applying `cleaning`
    to a whole column does not re-read the stop-word list for every row.
    """
    cached = getattr(_english_stop_words, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def cleaning(txt):
    """Normalise a text: lowercase it, keep letters only, drop English stop words.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' when nothing is left).
    """
    txt = txt.lower()
    # for special characters: everything except a-z and whitespace is removed,
    # which includes digits, punctuation and apostrophes ("don't" -> "dont")
    txt = re.sub(r'[^a-z\s]', '', txt)
    if not txt.strip():
        # No letters left: the result is empty, so skip tokenizing and the stop-word lookup
        return ''
    stop_words = _english_stop_words()  # for stop words
    words = nltk.word_tokenize(txt)
    return ' '.join(word for word in words if word not in stop_words)

# Store the cleaned version of every essay in a new column, then display the frame
cleaned_essays = df['full_text'].apply(cleaning)
df['text_clean'] = cleaned_essays
df

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


Unnamed: 0,essay_id,full_text,score,text_clean
0,000d118,Many people have car where they live. The thin...,3,many people car live thing dont know use car a...
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing face mars explaining...
2,001ab80,People always wish they had the same technolog...,4,people always wish technology seen movies best...
3,001bdc0,"We all heard about Venus, the planet without a...",4,heard venus planet without almost oxygen earth...
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,dear state senator letter argue favor keeping ...
...,...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2,story challenge exploing venus informative pie...
17303,ffddf1f,Technology has changed a lot of ways that we l...,4,technology changed lot ways live today nowaday...
17304,fff016d,If you don't like sitting around all day than ...,2,dont like sitting around day great opportunity...
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1,challenge exporing venus author suggests study...


In [4]:
# Tokenizing the text and extract basic features
def basic_feature_extraction(tt):
    """Compute simple lexical features of a text.

    Parameters
    ----------
    tt : str
        Text to analyse.

    Returns
    -------
    tuple
        ``(text_length, avg_length, ratio)``: the number of tokens, the mean
        token length in characters, and the type-token ratio (distinct tokens
        divided by tokens, a lexical-diversity measure). All three are 0 when
        the text contains no tokens.
    """
    # Blank text has no tokens; there is no need to run the tokenizer on it
    words = word_tokenize(tt) if tt.strip() else []
    text_length = len(words)
    if text_length == 0:
        # Without this guard the type-token ratio below divides by zero
        return 0, 0, 0

    # calculate the avg word length
    avg_length = sum(len(word) for word in words) / text_length

    # calculate the TTR: Type-Token Ratio : Lexical Diversity
    ratio = len(set(words)) / text_length

    return text_length, avg_length, ratio


# One (text_length, avg_length, ratio) tuple per essay, unpacked into three columns
basic_features = df['text_clean'].apply(basic_feature_extraction)
df['counts'], df['avg_length'], df['TTR'] = zip(*basic_features)
df

Unnamed: 0,essay_id,full_text,score,text_clean,counts,avg_length,TTR
0,000d118,Many people have car where they live. The thin...,3,many people car live thing dont know use car a...,239,5.723849,0.698745
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing face mars explaining...,136,5.492647,0.654412
2,001ab80,People always wish they had the same technolog...,4,people always wish technology seen movies best...,273,5.919414,0.593407
3,001bdc0,"We all heard about Venus, the planet without a...",4,heard venus planet without almost oxygen earth...,246,6.581301,0.662602
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,dear state senator letter argue favor keeping ...,182,6.956044,0.554945
...,...,...,...,...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2,story challenge exploing venus informative pie...,76,6.039474,0.802632
17303,ffddf1f,Technology has changed a lot of ways that we l...,4,technology changed lot ways live today nowaday...,300,6.216667,0.586667
17304,fff016d,If you don't like sitting around all day than ...,2,dont like sitting around day great opportunity...,93,5.236559,0.655914
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1,challenge exporing venus author suggests study...,128,6.796875,0.734375


In [5]:
#grammatical complexity
from nltk.tokenize import sent_tokenize
import spacy

def Grammatical(tx):
    """Return the average sentence length of `tx`, in tokens per sentence.

    Parameters
    ----------
    tx : str
        Text to analyse.

    Returns
    -------
    float or int
        Total token count over all sentences divided by the number of
        sentences; 0 when the text is blank.

    NOTE(review): this notebook calls the function on `text_clean`, from which
    `cleaning` has already removed every non-letter character. In the displayed
    rows `sent_avg_length` equals `counts`, i.e. each essay is treated as a
    single sentence. Passing the raw essay text is probably what was intended -
    confirm, and change the training and test calls together.
    """
    # Blank text has no sentences; the original divided by zero here
    sents = sent_tokenize(tx) if tx.strip() else []
    if not sents:
        return 0

    # avg sentence length
    lengths = [len(word_tokenize(se)) for se in sents]
    return sum(lengths) / len(sents)

# Average sentence length of each essay, computed on the cleaned text
avg_sentence_lengths = df['text_clean'].apply(Grammatical)
df['sent_avg_length'] = avg_sentence_lengths

In [6]:
def score(tx):
    """Compute a Flesch Reading Ease score for `tx`.

    The value is
    ``206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)``,
    where the syllable count of a token is approximated by the number of
    runs of the letters a, e, i, o, u, y it contains (case-insensitive).

    Parameters
    ----------
    tx : str
        Text to analyse.

    Returns
    -------
    float
        The score. For blank text both ratios are taken as 0, so the result
        is 206.835 instead of a ZeroDivisionError.
    """
    if tx.strip():
        # Tokenize once and reuse the tokens for both the word and syllable counts
        words = word_tokenize(tx)
        sentence_nbr = len(sent_tokenize(tx))
    else:
        words, sentence_nbr = [], 0

    words_nbr = len(words)
    syllabes_nbr = sum(len(re.findall(r'[aeiouy]+', mot, re.I)) for mot in words)

    # Same "0 when the denominator is 0" convention as avg_length in basic_feature_extraction
    words_per_sentence = words_nbr / sentence_nbr if sentence_nbr > 0 else 0
    syllables_per_word = syllabes_nbr / words_nbr if words_nbr > 0 else 0

    # complexity index : Flesch Reading Ease Score
    score_flesch = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    return score_flesch


# Flesch Reading Ease of each essay, computed on the cleaned text
flesch_scores = df['text_clean'].apply(score)
df['score_flesch'] = flesch_scores

In [7]:
# Display the frame, now including the engineered feature columns
df

Unnamed: 0,essay_id,full_text,score,text_clean,counts,avg_length,TTR,sent_avg_length,score_flesch
0,000d118,Many people have car where they live. The thin...,3,many people car live thing dont know use car a...,239,5.723849,0.698745,239.0,-198.578452
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing face mars explaining...,136,5.492647,0.654412,136.0,-86.097647
2,001ab80,People always wish they had the same technolog...,4,people always wish technology seen movies best...,273,5.919414,0.593407,273.0,-246.587473
3,001bdc0,"We all heard about Venus, the planet without a...",4,heard venus planet without almost oxygen earth...,246,6.581301,0.662602,246.0,-236.128171
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,dear state senator letter argue favor keeping ...,182,6.956044,0.554945,182.0,-202.875220
...,...,...,...,...,...,...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2,story challenge exploing venus informative pie...,76,6.039474,0.802632,76.0,-42.844474
17303,ffddf1f,Technology has changed a lot of ways that we l...,4,technology changed lot ways live today nowaday...,300,6.216667,0.586667,300.0,-277.581000
17304,fff016d,If you don't like sitting around all day than ...,2,dont like sitting around day great opportunity...,93,5.236559,0.655914,93.0,-34.927742
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1,challenge exporing venus author suggests study...,128,6.796875,0.734375,128.0,-119.383437


In [8]:
# Keep the identifier, the target and the five engineered features, on an independent copy
model_columns = ['essay_id','score','counts','avg_length','TTR','sent_avg_length','score_flesch']
data = df[model_columns]
data_copy = data.copy()

# Target: the essay score
y = data_copy['score']
# Predictors: every remaining column except the target and the identifier
x = data_copy.drop(['score', 'essay_id'], axis=1)
x

Unnamed: 0,counts,avg_length,TTR,sent_avg_length,score_flesch
0,239,5.723849,0.698745,239.0,-198.578452
1,136,5.492647,0.654412,136.0,-86.097647
2,273,5.919414,0.593407,273.0,-246.587473
3,246,6.581301,0.662602,246.0,-236.128171
4,182,6.956044,0.554945,182.0,-202.875220
...,...,...,...,...,...
17302,76,6.039474,0.802632,76.0,-42.844474
17303,300,6.216667,0.586667,300.0,-277.581000
17304,93,5.236559,0.655914,93.0,-34.927742
17305,128,6.796875,0.734375,128.0,-119.383437


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
# Recursive feature elimination, using a random forest to rank the features.
# RFE fits its own clone of the estimator it is given, so fitting `rf_model`
# beforehand only repeats work and has been dropped. The forest is seeded so the
# selected features are the same on every run.
rf_model = RandomForestRegressor(random_state=42)
selector = RFE(rf_model, n_features_to_select=3)
selector = selector.fit(x, y)
# Boolean mask of the kept features, then their ranking (1 = selected)
print(selector.support_)
print(selector.ranking_)

[ True False False  True  True]
[1 3 2 1 1]


In [10]:
# we will select the following features : counts , sent_avg_length , score_flesch
x=x[['counts','sent_avg_length','score_flesch']]
# model training
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


def _fit_and_score_on_train(model):
    """Fit `model` on the training split and score it on that same split.

    Returns a tuple ``(floored_predictions, accuracy)`` where the predictions
    are the regressor outputs rounded down with ``np.floor`` and the accuracy
    is the fraction of them that equal ``y_train`` exactly.

    NOTE(review): flooring maps a prediction such as 3.9 to 3; rounding to the
    nearest score may be what was intended. The later evaluation and test cells
    also use np.floor, so change them together if this is revised.
    """
    model.fit(X_train, y_train)
    floored_predictions = np.floor(model.predict(X_train))
    return floored_predictions, accuracy_score(y_train, floored_predictions)


# NOTE(review): the three errors below are measured on the data the models were
# fitted on; X_test / y_test are not used in this cell.
#************************************************************** SVR *********************
svm = SVR()
y_train_predict, train_accuracy = _fit_and_score_on_train(svm)
# error training
train_error = 1 - train_accuracy
print(f"training error (train): {train_error}")
#************************************************************** gbm *********************
# Seeded so the reported error is the same on every run
gbm = GradientBoostingRegressor(random_state=42)
y_train_predict_gbm, train_accuracy_gbm = _fit_and_score_on_train(gbm)
# error training
train_error_gbm = 1 - train_accuracy_gbm
print(f"training error (train): {train_error_gbm}")
#************************************************************** Random Forests **********
# Seeded so the reported error and the later predictions are the same on every run
rf = RandomForestRegressor(random_state=42)
y_train_predict_RF, train_accuracy_RF = _fit_and_score_on_train(rf)
# error training
train_error_RF = 1 - train_accuracy_RF
print(f"training error (train): {train_error_RF}")

training error (train): 0.6177681473456121
training error (train): 0.6375586854460094
training error (train): 0.554785120982304


**Of the three models, we will use the Random Forest model because it shows the lowest error on the training set.**

In [11]:
# Score the random forest on the held-out split: floor its predictions,
# then report one minus the exact-match accuracy.
y_test_pred_RF = np.floor(rf.predict(X_test))
test_accuracy_RF = accuracy_score(y_test, y_test_pred_RF)
test_error_RF = 1 - test_accuracy_RF
test_error_RF

0.6377816291161178

**Predictions for the test dataset**

In [12]:
# Load the competition test essays
test_csv_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
df_test = pd.read_csv(test_csv_path)

# Same feature pipeline as for the training frame, applied column by column
df_test['text_clean'] = df_test['full_text'].apply(cleaning)
test_basic_features = df_test['text_clean'].apply(basic_feature_extraction)
df_test['counts'], df_test['avg_length'], df_test['TTR'] = zip(*test_basic_features)
df_test['sent_avg_length'] = df_test['text_clean'].apply(Grammatical)
df_test['score_flesch'] = df_test['text_clean'].apply(score)

# prediction: the three selected features, in the order used for training, floored
t = df_test[['counts','sent_avg_length','score_flesch']]
test_label = np.floor(rf.predict(t))
test_label

array([3., 2., 4.])

In [13]:
#save the sample_submission
save_df=df_test[['essay_id','full_text']]
test_label=np.round(test_label).astype(int)
save_df['score']=test_label
save_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  save_df['score']=test_label


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,4


In [14]:
# Fill the sample submission with the predicted scores and write it to the working directory
sample_submission_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
# NOTE(review): the scores are assigned by position, which assumes the sample
# submission lists the essays in the same order as test.csv - confirm.
submission_df['score'] = test_label
submission_df.to_csv("submission.csv", index=False)