In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv


In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
test_data = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")

In [4]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    
    tokens = [token.lower() for token in tokens]
    
    tokens = [token for token in tokens if token.isalnum()]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

def extract_features(texts):
    tfidf_vectorizer = TfidfVectorizer()
    
    features = tfidf_vectorizer.fit_transform(texts)
    
    return features

In [5]:
train_text_summaries = train_data.text
test_text_summaries = test_data.text

In [6]:
preprocessed_summaries_train = [preprocess_text(summary) for summary in train_text_summaries]
preprocessed_summaries_test = [preprocess_text(summary) for summary in test_text_summaries]

train_tfidf_features = extract_features(preprocessed_summaries_train)
test_tfidf_features = extract_features(preprocessed_summaries_test)

In [7]:
train_tfidf_features.shape

(7165, 11986)

In [8]:
test_tfidf_features.shape

(4, 2)

In [9]:
target_labels = train_data[['content', 'wording']]

In [10]:
regressor = XGBRegressor()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train_tfidf_features, target_labels, test_size=0.2, random_state=42)

In [12]:
multi_output_regressor = MultiOutputRegressor(regressor)

In [13]:
multi_output_regressor.fit(X_train, y_train)

In [14]:
predictions = multi_output_regressor.predict(X_test)

In [15]:
mse = mean_squared_error(y_test, predictions, multioutput='raw_values')

In [16]:
print(mse)

[0.33713939 0.50192216]


In [17]:
mse.mean()

0.41953077494365976