In [3]:
!unzip data.zip

Archive:  data.zip
  inflating: test_data_post.csv      
  inflating: train_data.csv          


In [80]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train_data.csv")
df.head()

Unnamed: 0,Tweets,Risk Analysis
0,interested in a job in epping nh this could be...,0.0
1,community violence cv is a public health epide...,1.0
2,cvs took a step that shows it s gearing up to ...,0.0
3,this job is now open at cvs health in henderso...,0.0
4,at cvs health we are committed to empowering m...,0.0


In [3]:
df_test = pd.read_csv("test_data_post.csv")
df_test.head()

Unnamed: 0,ID,Tweets
0,1,knowing what s right for you and your body isn...
1,2,looks like we need to boycott walmart exxonmob...
2,3,#cvshealth says of its employees are racially ...
3,4,be a part of cvshealth on our journey to trans...
4,5,juddlegum ford walmart delta deloitte nrcc lib...


In [4]:
df.isna().sum()

Tweets           0
Risk Analysis    0
dtype: int64

In [5]:
df.shape

(20204, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20204 entries, 0 to 20203
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Tweets         20204 non-null  object 
 1   Risk Analysis  20204 non-null  float64
dtypes: float64(1), object(1)
memory usage: 315.8+ KB


## EDA

In [7]:
risk_tweets = df[df['Risk Analysis'] == 1.0]
risk_tweets.head()

Unnamed: 0,Tweets,Risk Analysis
1,community violence cv is a public health epide...,1.0
5,davidha ndbisceglia hokiepharm cvshealth ftc ...,1.0
7,let s ask cvshealth since caremark dropped ap...,1.0
9,astraea muse shoesg nandoodles reedgalen aure...,1.0
11,this is the kind of positive affirmation we al...,1.0


In [8]:
risk_tweets.shape[0] / df.shape[0]

0.31810532567808353

In [9]:
risk_tweets.sample(1).values

array([['health amp welfare fund of the united food amp commercial workers local afl cio v allied extruders inc cv complaint ',
        1.0]], dtype=object)

## Cleaning

Some tweets have leading or trailing whitespace, so we strip it.

In [10]:
df['Tweets'] = df['Tweets'].str.strip()

In [11]:
df['Risk Analysis'] = df['Risk Analysis'].astype(int)

### Oversampling

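The dataset is imbalanced (only about 32% of the tweets are labelled as risk, `Risk Analysis == 1`), so we'll oversample the minority class until both classes have the same number of rows.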

In [12]:
def oversample_data(df, label_col='Risk Analysis', random_state=None):
  """Balance the classes of `df` by oversampling under-represented classes.

  Every class that has fewer rows than the largest class is topped up to the
  size of the largest class by drawing its own rows with replacement. The
  minority class is detected from the label counts, so the function works
  whichever label happens to be the rarer one.

  Args:
    df: DataFrame containing the label column.
    label_col: Name of the label column.
    random_state: Optional seed forwarded to `DataFrame.sample`; pass a fixed
      value to make the resampling reproducible.

  Returns:
    A new DataFrame made of the original rows followed by the resampled rows.
    The input frame is not modified. If the classes are already balanced (or
    there is at most one class) a plain copy is returned and nothing is printed.
  """
  class_counts = df[label_col].value_counts()
  if class_counts.empty:
    return df.copy()

  largest_class_size = class_counts.max()
  resampled_parts = []
  for label, count in class_counts.items():
    sample_diff = int(largest_class_size - count)
    if sample_diff == 0:
      # Already as large as the biggest class: nothing to add.
      continue

    print(f"Sample Size Difference: {sample_diff}")

    minority_rows = df[df[label_col] == label]
    resampled_parts.append(
        minority_rows.sample(sample_diff, replace=True, random_state=random_state)
    )

  return pd.concat([df, *resampled_parts], axis=0)

## Train / Test Split

We'll keep the first 90% of the rows for training and hold out the last 10% for testing. The split itself does not balance the classes; the training portion is balanced afterwards by oversampling.

In [13]:
# Ordered 90/10 split: the first 90% of rows train the models,
# the remaining rows are held out for evaluation.
train_data_size = int(len(df) * 0.9)

train_data, test_data = df.iloc[:train_data_size], df.iloc[train_data_size:]

In [14]:
train_data.shape, test_data.shape

((18183, 2), (2021, 2))

In [15]:
train_data['Risk Analysis'].value_counts()

0    12427
1     5756
Name: Risk Analysis, dtype: int64

To fix the imbalance, we'll oversample our training data

In [16]:
train_data = oversample_data(train_data)

Sample Size Difference: 6671


In [17]:
train_data['Risk Analysis'].value_counts()

0    12427
1    12427
Name: Risk Analysis, dtype: int64

## Preprocessing

For the classical (statistical) models, we'll convert the tweets into TF-IDF vectors. The vectorizer is fitted on the training tweets only.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_data['Tweets'])

In [19]:
# Vectorize both splits with the vocabulary fitted on the training tweets.
train_features, test_features = (
    tfidf_vectorizer.transform(split['Tweets']) for split in (train_data, test_data)
)

# Targets for each split.
train_labels, test_labels = train_data['Risk Analysis'], test_data['Risk Analysis']

In [20]:
train_features.shape, test_features.shape

((24854, 26476), (2021, 26476))

## Modelling

We'll test a few models:

- LogisticRegression, SVC, RandomForestClassifier, DecisionTreeClassifier and XGBClassifier on TF-IDF features
- DistilBERT fine-tuned for binary sequence classification

In [21]:
# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# metrics
from sklearn.metrics import f1_score

In [22]:
# Fixed seed for the estimators that use randomness during fitting
# (random forest and decision tree), so their F1 scores are reproducible.
RANDOM_STATE = 42

# Registry of candidate models: each entry holds the (unfitted) estimator and
# a slot for the F1 score that the training loop fills in.
stat_model_dict = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "f1_score": 0
    },
    "SVC": {
        "model": SVC(),
        "f1_score": 0
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(
            n_estimators=100,
            max_depth=5,
            min_samples_split=5,
            random_state=RANDOM_STATE
          ),
        "f1_score": 0
    },
    "DecisionTreeClassifier": {
      "model": DecisionTreeClassifier(random_state=RANDOM_STATE),
      "f1_score": 0
    },
    "XGBClassifier": {
        "model": XGBClassifier(),
        "f1_score": 0
    },
}

In [23]:
for key, value in stat_model_dict.items():

  print(f"Training {key}...")

  # Fit on the oversampled TF-IDF training features; fit() hands back the
  # estimator itself, so `model` is the same object stored in the registry.
  model = value['model'].fit(train_features, train_labels)

  # Predict the held-out split and record its F1 score next to the model.
  # `value` is the very dict stored under stat_model_dict[key].
  test_predictions = model.predict(test_features)
  score = value['f1_score'] = f1_score(test_labels, test_predictions)
  print(f"F1 Score: {score}\n\n")

Training Logistic Regression...
F1 Score: 0.8495451364590622


Training SVC...
F1 Score: 0.8651942117288652


Training RandomForestClassifier...
F1 Score: 0.6742671009771987


Training DecisionTreeClassifier...
F1 Score: 0.7211394302848576


Training XGBClassifier...
F1 Score: 0.8333333333333334




Generating predictions for the real (unlabelled) test data with the best classical model, to be submitted for scoring.

In [24]:
test_df_tweets = df_test['Tweets']
test_df_tweets_transformed = tfidf_vectorizer.transform(test_df_tweets)

In [25]:
test_predictions = stat_model_dict['SVC']['model'].predict(test_df_tweets_transformed)
test_predictions

array([0, 1, 1, ..., 0, 0, 1])

In [26]:
df_test['Risk Analysis'] = test_predictions
df_test.head()

Unnamed: 0,ID,Tweets,Risk Analysis
0,1,knowing what s right for you and your body isn...,0
1,2,looks like we need to boycott walmart exxonmob...,1
2,3,#cvshealth says of its employees are racially ...,1
3,4,be a part of cvshealth on our journey to trans...,0
4,5,juddlegum ford walmart delta deloitte nrcc lib...,0


In [27]:
df_test.drop(['Tweets'], axis=1).to_csv("submission_2.csv", index=False)

## BERT

We'll fine-tune DistilBERT (a smaller, distilled version of BERT) on this dataset to see whether it improves on the classical models above, whose best F1 score was about 0.87 (SVC).

In [28]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [29]:
model_id = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = DistilBertForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Structure Dataset for Huggingface

To fine-tune with the Hugging Face `Trainer`, we first put the data into a `datasets.Dataset` in which every example is a record with a `text` field and a `label` field. Each record looks like this:

```python
{
  "label": 0,
  "text": "some text that needs to be classified"
}
```

In [30]:
from datasets import Dataset

In [31]:
# Build the list of {"text": ..., "label": ...} records in one vectorized
# step instead of appending row by row with iterrows(), which is slow on
# ~25k rows. Row order (including the oversampled rows) is preserved.
hf_dataset = (
    train_data[['Tweets', 'Risk Analysis']]
    .rename(columns={'Tweets': 'text', 'Risk Analysis': 'label'})
    .to_dict('records')
)

In [32]:
def data_generator():
  """Yield the prepared {"text", "label"} records of hf_dataset one by one."""
  yield from hf_dataset

# Materialize the records as a Hugging Face Dataset.
ds = Dataset.from_generator(data_generator)

Generating train split: 0 examples [00:00, ? examples/s]

In [39]:
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating to the model's maximum length.

    Args:
        examples: A batch from `Dataset.map(batched=True)`; its "text" entry
            holds the tweets to tokenize.

    Returns:
        The tokenizer output (input ids and attention mask) for the batch.
    """
    # No padding here: padding="max_length" would pad every short tweet up to
    # the model maximum. The Trainer below is given the tokenizer, so it pads
    # each batch only to that batch's longest sequence at collation time.
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/24854 [00:00<?, ? examples/s]

In [42]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 24854
})

## Training Setup

In [66]:
import evaluate
import torch

In [40]:
training_args = TrainingArguments(output_dir="kaggle_comp_model")
metric = evaluate.load("f1")

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer
)

In [45]:
trainer.train()

Step,Training Loss
500,0.3381
1000,0.2268
1500,0.1763
2000,0.1542
2500,0.1196
3000,0.096
3500,0.0556
4000,0.0457
4500,0.045
5000,0.0588


Step,Training Loss
500,0.3381
1000,0.2268
1500,0.1763
2000,0.1542
2500,0.1196
3000,0.096
3500,0.0556
4000,0.0457
4500,0.045
5000,0.0588


TrainOutput(global_step=9321, training_loss=0.07916047909193036, metrics={'train_runtime': 3885.2891, 'train_samples_per_second': 19.191, 'train_steps_per_second': 2.399, 'total_flos': 9877034178588672.0, 'train_loss': 0.07916047909193036, 'epoch': 3.0})

In [46]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

'thepbb cv thank you for protecting our health and peace of mind sweetheart #alexailacad #kdestrada #kdlex #runtome'

In [83]:
# Predict the held-out split with the fine-tuned model, one tweet at a time.
test_tweets_bert = test_data['Tweets']
tweet_bert_results = []

# Make sure dropout is disabled even if the earlier model.eval() cell was skipped.
model.eval()

# Inference only: no_grad() avoids building the autograd graph for each
# forward pass, which saves memory and time.
with torch.no_grad():
  for tweet in test_tweets_bert.values:
    # Truncate like during training so over-long tweets cannot exceed the
    # model's maximum sequence length.
    tweet_tokenized = tokenizer(tweet, return_tensors="pt", truncation=True).to(device)
    logits = model(**tweet_tokenized).logits.cpu()
    # Single example per forward pass, so the flat argmax is the class index.
    output = torch.argmax(logits).tolist()
    tweet_bert_results.append(output)

In [85]:
tweet_bert_results = np.array(tweet_bert_results)
f1_score(test_labels,tweet_bert_results)

0.9648466716529543

In [87]:
# Predict the competition test set with the fine-tuned model, one tweet at a time.
test_tweets_bert = df_test['Tweets']
tweet_bert_results_main_test = []

# Make sure dropout is disabled even if the earlier model.eval() cell was skipped.
model.eval()

# Inference only: no_grad() avoids building the autograd graph for each
# forward pass, which saves memory and time.
with torch.no_grad():
  for tweet in test_tweets_bert.values:
    # Truncate like during training so over-long tweets cannot exceed the
    # model's maximum sequence length.
    tweet_tokenized = tokenizer(tweet, return_tensors="pt", truncation=True).to(device)
    logits = model(**tweet_tokenized).logits.cpu()
    # Single example per forward pass, so the flat argmax is the class index.
    output = torch.argmax(logits).tolist()
    tweet_bert_results_main_test.append(output)

In [88]:
df_test['Risk Analysis'] = tweet_bert_results_main_test

In [90]:
df_test.drop(['Tweets'],axis=1).to_csv("submission_bert.csv",index=False)

## END