In [4]:
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
test_labels = pd.read_csv('sample_submission.csv')['target']

In [6]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'\d+', '', text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  tokens = nltk.word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if not token in stop_words]
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(tokens)

train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])

In [8]:
X_train, X_val, y_train, y_val = train_test_split(train_features, train_data['target'], stratify=train_data['target'], test_size=0.2, random_state=42)

In [9]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [10]:
predictions = model.predict(test_features)

submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})
submission.to_csv('my_submission.csv', index=False)

predicted_labels = submission['target']
accuracy = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy score: {accuracy}')

Accuracy score: 0.9834508121360711
