In [64]:
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
test_labels = pd.read_csv('sample_submission.csv')['target']

In [66]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'\d+', '', text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  tokens = nltk.word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if not token in stop_words]
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(tokens)

train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

In [67]:
train_data.head(40)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive u,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,1
3,6,,,people receive wildfire evacuation order calif...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1
5,8,,,rockyfire update california hwy closed directi...,1
6,10,,,flood disaster heavy rain cause flash flooding...,1
7,13,,,im top hill see fire wood,1
8,14,,,there emergency evacuation happening building ...,1
9,15,,,im afraid tornado coming area,1


In [68]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])

In [69]:
print(train_features)

  (0, 1307)	0.3668328300610611
  (0, 16217)	0.3668328300610611
  (0, 49112)	0.3668328300610611
  (0, 13416)	0.3668328300610611
  (0, 20317)	0.3507436574321259
  (0, 1305)	0.30296907485528024
  (0, 16216)	0.2407251683214562
  (0, 49104)	0.2568143409503914
  (0, 13414)	0.3507436574321259
  (1, 52032)	0.3346728758298464
  (1, 51046)	0.3346728758298464
  (1, 35571)	0.3346728758298464
  (1, 42085)	0.3346728758298464
  (1, 20271)	0.3346728758298464
  (1, 7814)	0.2642270399699744
  (1, 52031)	0.3346728758298464
  (1, 51045)	0.3346728758298464
  (1, 35561)	0.23663604712028144
  (1, 42065)	0.215356995291286
  (1, 20247)	0.2092063106542541
  (2, 44304)	0.2156572940066389
  (2, 46204)	0.2156572940066389
  (2, 17662)	0.2156572940066389
  (2, 43675)	0.2156572940066389
  (2, 43131)	0.2156572940066389
  :	:
  (7611, 10313)	0.1455091437958984
  (7611, 59433)	0.1919059039448603
  (7611, 33315)	0.1687991910410005
  (7611, 37360)	0.14291770487104952
  (7611, 32824)	0.1317839276032016
  (7611, 50465)	0.18

In [70]:
X_train, X_val, y_train, y_val = train_test_split(train_features, train_data['target'], stratify=train_data['target'], test_size=0.2, random_state=42)

In [71]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [72]:
predictions = model.predict(test_features)

submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})
submission.to_csv('my_submission.csv', index=False)

predicted_labels = submission['target']
accuracy = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy score: {accuracy}')

Accuracy score: 0.9834508121360711
