In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Display the training frame read from train.csv (rendered as the cell's last expression).
train

Unnamed: 0,abstract,category
0,"In the last four years, daily deals have eme...",Applied
1,We propose a novel approach for density esti...,ML
2,"In this research, two-state Markov switching...",Applied
3,This article considers the estimation of the...,Applied
4,Markowitz's celebrated mean--variance portfo...,Applied
...,...,...
59,The use of Reinforcement Learning in real-wo...,ML
60,The paper introduces a penalized matrix esti...,ML
61,Cross-validation (CV) is widely used for tun...,Applied
62,We analyze the results of the German Team Ha...,Applied


In [4]:
# Display the test frame read from test.csv (rendered as the cell's last expression).
test

Unnamed: 0,abstract
0,Probabilistic principal component analysis (...
1,Sensor-based degradation signals measure the...
2,The generic identification problem is to dec...
3,We introduce a new class of lower bounds on ...
4,Regularization is a powerful technique for e...
...,...
3931,The goal of cross-domain object matching (CD...
3932,Sequential prediction problems such as imita...
3933,Minimizing the relative inertia of a statist...
3934,The 2004 US Presidential Election cycle mark...


In [5]:
# List the distinct class labels in order of first appearance.
pd.unique(train['category'])

array(['Applied', 'ML'], dtype=object)

In [6]:
# Normalise case in BOTH frames. The original cell lowercased `train` twice and
# never touched `test`, so the two frames were not preprocessed the same way.
# (TfidfVectorizer lowercases by default, so model inputs are unaffected, but the
# frames themselves are now consistent for any other consumer.)
# `.str.lower()` is the vectorised pandas form and passes missing values through
# instead of raising AttributeError on a float NaN.
train['abstract'] = train['abstract'].str.lower()
test['abstract'] = test['abstract'].str.lower()

In [7]:
# Hold out a quarter of the labelled abstracts for validation; the fixed seed
# keeps the split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train['abstract'],
    train['category'],
    test_size=0.25,
    random_state=8,
)

In [8]:
# Fit the vocabulary (limited to 5000 features) and IDF weights on the training
# split only, then apply the already-fitted vectorizer to the validation and
# test abstracts so no held-out text influences the features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(docs) for docs in (X_val, test['abstract'])
)

In [9]:
# Seed the forest so its bootstrap sampling and feature sub-sampling are
# repeatable: without `random_state` the validation metrics and the written
# predictions change on every Restart & Run All. The same seed value as the
# train/validation split is used.
model = RandomForestClassifier(random_state=8)

In [10]:
# Train the classifier on the TF-IDF features of the training split.
model.fit(X=X_train_tfidf, y=y_train)

In [11]:
# Predict on the data the model was fitted on and on the held-out validation
# split so the two can be compared in the following cells.
train_prediction = model.predict(X=X_train_tfidf)
val_predictions = model.predict(X=X_val_tfidf)

In [12]:
# Confusion matrix on the training split (rows: true labels, columns: predictions).
print(confusion_matrix(y_true=y_train, y_pred=train_prediction))

[[24  0]
 [ 0 24]]


In [13]:
# Per-class precision/recall/F1 on the validation split, followed by its
# confusion matrix (rows: true labels, columns: predictions).
print(classification_report(y_true=y_val, y_pred=val_predictions))
print(confusion_matrix(y_true=y_val, y_pred=val_predictions))

              precision    recall  f1-score   support

     Applied       0.62      0.83      0.71         6
          ML       0.88      0.70      0.78        10

    accuracy                           0.75        16
   macro avg       0.75      0.77      0.75        16
weighted avg       0.78      0.75      0.75        16

[[5 1]
 [3 7]]


In [14]:
# Label the unlabelled test abstracts with the fitted model.
test_predictions = model.predict(X=X_test_tfidf)

In [15]:
# Wrap the predicted labels in a single-column frame, one row per test abstract.
output = pd.DataFrame({'prediction': test_predictions})

In [16]:
# Write the predictions to disk without the positional index column.
output.to_csv(path_or_buf='output.csv', index=False)