In [1]:
!pip install transformers
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
Col

In [3]:
import torch
import pandas as pd

from google.colab import drive

# Directory under which Google Drive is attached; the CSV paths used by the
# loading cell start with this prefix.
DRIVE_MOUNT_POINT = '/content/drive/'

drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [4]:
# Number of leading rows of train.csv used for training; every remaining row
# is held out for validation.
# NOTE(review): 80 is a row count, not a percentage — confirm that such a
# small training subset is intentional (an 80% split may have been meant).
N_TRAIN_ROWS = 80

train_raw = pd.read_csv('/content/drive/MyDrive/train.csv')

# .copy() turns each split into an independent frame. Later cells add columns
# to `df`; without the copy those writes target a slice of the loaded frame
# and pandas emits SettingWithCopyWarning.
df = train_raw.iloc[:N_TRAIN_ROWS].copy()
df_valid = train_raw.iloc[N_TRAIN_ROWS:].copy()

# Unlabelled rows to predict for the submission file.
df_test = pd.read_csv('/content/drive/MyDrive/test.csv')

In [12]:
# Preview the first rows of the training frame.
# NOTE(review): this cell's recorded execution count (12) is later than that
# of the cells below it, and its saved output already shows the
# `tokenized_text` and `parsed_text` columns those cells add. On a fresh
# top-to-bottom run the preview at this position will not contain them.
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized_text,parsed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[101, 3458, 9115, 3680, 1132, 1103, 21642, 110...",our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[101, 4089, 1783, 1485, 2001, 6413, 2176, 1778...",forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[101, 1398, 3159, 1455, 1106, 112, 7890, 1107,...",all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[101, 1492, 117, 1288, 1234, 3531, 108, 4098, ...",13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[101, 2066, 1400, 1850, 1142, 6307, 1121, 1137...",just got sent this photo from ruby alaska as s...


In [5]:
# Tokenizer built from the pretrained 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Encode each text into a list of token ids. `add_special_tokens=True` is
# forwarded by `apply` to `tokenizer.encode` for every row.
encoded_texts = df['text'].apply(tokenizer.encode, add_special_tokens=True)
df['tokenized_text'] = encoded_texts

df.head()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Unnamed: 0,id,keyword,location,text,target,tokenized_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[101, 3458, 9115, 3680, 1132, 1103, 21642, 110..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[101, 4089, 1783, 1485, 2001, 6413, 2176, 1778..."
2,5,,,All residents asked to 'shelter in place' are ...,1,"[101, 1398, 3159, 1455, 1106, 112, 7890, 1107,..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[101, 1492, 117, 1288, 1234, 3531, 108, 4098, ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[101, 2066, 1400, 1850, 1142, 6307, 1121, 1137..."


In [6]:
import re

def custom_tok(input_text: str) -> str:
  """Strip punctuation from a text and lower-case it.

  Every character that is neither a word character (letter, digit,
  underscore) nor whitespace is deleted. The remaining text is split on
  single spaces, each piece is lower-cased, and the pieces are joined back
  with single spaces, so the original spacing is preserved.

  Args:
    input_text: The raw text to normalise.

  Returns:
    The punctuation-free, lower-cased text.
  """
  without_punctuation = re.sub(r'[^\w\s]', '', input_text)
  return " ".join(piece.lower() for piece in without_punctuation.split(" "))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Normalised copy of every text (punctuation removed, lower-cased).
df['parsed_text'] = df['text'].apply(custom_tok)

# Two-stage featurisation: token counts first, then tf-idf weighting of
# those counts. Both transformers are fitted on the training texts here.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(df['parsed_text'])
# Despite its name, `tf_transformer` holds the transformed matrix returned by
# fit_transform, not a transformer object.
tf_transformer = tfidf_transformer.fit_transform(X_train_counts)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# Text classifier: token counts -> tf-idf weighting -> logistic regression.
# The pipeline is handed to GridSearchCV unfitted; the search fits its own
# copies for every parameter combination, so no separate fit is needed here.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(class_weight = 'balanced',max_iter=5000)),
    ])

# Search space; each key is '<pipeline step name>__<parameter name>'.
# NOTE(review): check whether `clf__C` has any effect for the
# `clf__penalty=None` combinations — if not, those grid points are duplicates.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2),(1, 3)],
    'tfidf__use_idf': (True, False),
    'clf__penalty': (None,'l2'),
    'clf__C' : (0.1, 0.5, 1, 1.5, 2, 2.5, 3)

    }

# 5-fold cross-validated search scored by F1, using all available cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, scoring='f1',n_jobs=-1)
gs_clf = gs_clf.fit(df['parsed_text'], df['target'])

# The validation frame was split off before `parsed_text` was added to `df`,
# so it has no such column. Apply the same normalisation here so the model is
# evaluated on text prepared the same way as its training input.
valid_parsed_text = df_valid['text'].apply(custom_tok)
predicted = gs_clf.predict(valid_parsed_text)

# F1 on the held-out rows; shown as the cell's output.
f1_score(df_valid.target,predicted)

In [None]:
# Final model with fixed hyper-parameters: unigrams + bigrams, idf weighting
# and C=3.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LogisticRegression(class_weight = 'balanced',max_iter=5000, C=3,)),
    ])

# Fit on every labelled row (training split plus validation split).
# NOTE(review): assumes the final model is meant to be trained on all of
# train.csv — confirm.
labeled_df = pd.concat([df, df_valid])
text_clf.fit(labeled_df['text'], labeled_df['target'])

# Predict with the pipeline fitted just above. It was trained on the raw
# `text` column, so the raw test text is the matching input; `gs_clf` was
# fitted on `parsed_text` and would see differently prepared text here.
# assumes test.csv has an `id` column like train.csv and that the expected
# submission layout is `id`,`target` — TODO confirm.
submission_df = pd.DataFrame({
    'id': df_test['id'],
    'target': text_clf.predict(df_test['text']),
})
submission_df.to_csv('submission.csv',index=False)