# Using TF-IDF

In [3]:
import pandas as pd
import re
# Read the training and test splits from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

def create_features(df):
    """Return a copy of ``df`` with per-row count features derived from ``text``.

    Added columns:
        nhashtags -- number of hashtag tokens (``#`` followed by word characters)
        nmentions -- number of mention tokens (``@`` followed by word characters)
        nnumbers  -- number of numeric tokens (digits, optionally with ``,`` / ``.``)
        ninter    -- number of ``?`` characters
        nexcl     -- number of ``!`` characters

    Parameters:
        df: DataFrame that has a ``text`` column.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is left untouched and the call can
    # be repeated safely when a notebook cell is re-run.
    featured = df.copy()

    # Missing text (None/NaN) is treated as an empty string so every count is 0
    # instead of raising on a non-string value.
    text = featured["text"].fillna("").astype(str)

    regex_features = {
        "nhashtags": r'\B#\w+',
        "nmentions": r'\B@\w+',
        "nnumbers": r'\b-?\d[\d,\.]*\b',
    }
    for column, pattern in regex_features.items():
        compiled = re.compile(pattern)
        # Bind the compiled pattern as a default so each lambda keeps its own regex.
        featured[column] = text.apply(lambda x, rx=compiled: len(rx.findall(x)))

    featured["ninter"] = text.apply(lambda x: x.count("?"))
    featured["nexcl"] = text.apply(lambda x: x.count("!"))
    return featured

# Add the count feature columns to both splits.
train, test = (create_features(frame) for frame in (train, test))


In [51]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and strip URLs, mentions, '#' signs, digits and punctuation.

    The removals are applied in a fixed order; whitespace left behind by a
    removed token is kept as-is.
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must go before punctuation is stripped,
    # otherwise their remains would survive as ordinary words.
    removal_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # @mentions
        r'#',          # hash signs (the tag word itself is kept)
        r'\d+',        # digit runs
        r'[^\w\s]',    # anything that is neither a word character nor whitespace
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Clean the text column of both splits with preprocess_text
for frame in (train, test):
    frame['text'] = frame['text'].apply(preprocess_text)

# Numeric count columns that are scaled separately from the text
numeric_columns = ['nhashtags', 'nmentions', 'nnumbers', 'ninter', 'nexcl']

# TF-IDF over the 'text' column, limited to at most 100000 features
text_transformer = Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=100000))])

# Min-max scaling for the numeric count columns
num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Route each column group to its transformer; every other column is dropped
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('text', text_transformer, 'text'),
        ('num', num_transformer, numeric_columns),
    ],
)

# Full model: feature preprocessing followed by logistic regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Separate the feature columns from the label column once
X_train = train.drop(columns=['target'])
y_train = train['target']

# Fit the whole pipeline (feature extraction + classifier) on the training data
model_pipeline.fit(X_train, y_train)

# Predict on the same rows the model was fitted on, so the numbers below are
# in-sample metrics rather than a held-out estimate
y_train_pred = model_pipeline.predict(X_train)

# Model statistics
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Train Classification Report:\n", classification_report(y_train, y_train_pred))
print("Train Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

# Predict on the test set through the full fitted pipeline, so the
# preprocessing and the classifier are always applied together instead of
# being invoked step by step by hand.
y_test_pred = model_pipeline.predict(test)

# Save the predictions in submission.csv (one row per test id, no index column)
submission = pd.DataFrame({'id': test["id"].values, 'target': y_test_pred})
submission.to_csv('submission.csv', index=False)

# Confirm completion; the recorded cell output shows this message.
print("Predictions saved to submission.csv")

Train Accuracy: 0.8802049126494155
Train Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.96      0.90      4342
           1       0.93      0.78      0.85      3271

    accuracy                           0.88      7613
   macro avg       0.89      0.87      0.87      7613
weighted avg       0.89      0.88      0.88      7613

Train Confusion Matrix:
 [[4156  186]
 [ 726 2545]]
Predictions saved to submission.csv


# Using BERT

In [4]:
import pandas as pd
import re
# Read the training and test splits from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

def create_features(df):
    """Return a copy of ``df`` with per-row count features derived from ``text``.

    Added columns:
        nhashtags -- number of hashtag tokens (``#`` followed by word characters)
        nmentions -- number of mention tokens (``@`` followed by word characters)
        nnumbers  -- number of numeric tokens (digits, optionally with ``,`` / ``.``)
        ninter    -- number of ``?`` characters
        nexcl     -- number of ``!`` characters

    Parameters:
        df: DataFrame that has a ``text`` column.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is left untouched and the call can
    # be repeated safely when a notebook cell is re-run.
    featured = df.copy()

    # Missing text (None/NaN) is treated as an empty string so every count is 0
    # instead of raising on a non-string value.
    text = featured["text"].fillna("").astype(str)

    regex_features = {
        "nhashtags": r'\B#\w+',
        "nmentions": r'\B@\w+',
        "nnumbers": r'\b-?\d[\d,\.]*\b',
    }
    for column, pattern in regex_features.items():
        compiled = re.compile(pattern)
        # Bind the compiled pattern as a default so each lambda keeps its own regex.
        featured[column] = text.apply(lambda x, rx=compiled: len(rx.findall(x)))

    featured["ninter"] = text.apply(lambda x: x.count("?"))
    featured["nexcl"] = text.apply(lambda x: x.count("!"))
    return featured

# Add the count feature columns to both splits.
train, test = (create_features(frame) for frame in (train, test))

In [5]:
import pandas as pd
import re
import torch
import logging
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Configure the root logger at INFO level.
# NOTE(review): INFO is more verbose than Python's default WARNING threshold, so
# this call does not suppress warnings; if silencing them is the goal, a higher
# level (e.g. logging.ERROR) would be needed -- confirm intent.
logging.basicConfig(level=logging.INFO)

# Preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and strip URLs, mentions, '#' signs, digits and punctuation.

    The removals are applied in a fixed order; whitespace left behind by a
    removed token is kept as-is.
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must go before punctuation is stripped,
    # otherwise their remains would survive as ordinary words.
    removal_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # @mentions
        r'#',          # hash signs (the tag word itself is kept)
        r'\d+',        # digit runs
        r'[^\w\s]',    # anything that is neither a word character nor whitespace
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Clean the text column of both splits with preprocess_text
for frame in (train, test):
    frame['text'] = frame['text'].apply(preprocess_text)

# Load the pretrained 'bert-base-uncased' tokenizer and a sequence classifier with two labels
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Encode both splits with the same settings (truncation and padding on, max length 512)
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train['text'].tolist(), **tokenize_options)
test_encodings = tokenizer(test['text'].tolist(), **tokenize_options)

# Convert to torch tensors
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is an optional per-example sequence. Without labels the items
    contain only the encoding fields.
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # Labels define the size when given; otherwise count the encoded inputs.
        if self.labels is None:
            return len(self.encodings['input_ids'])
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: a tensor per encoding field, plus the label if any.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training split together with its target labels
train_dataset = Dataset(train_encodings, train['target'].tolist())

# Min-max scale the numeric count columns: fit on train, reuse that fit for test.
# NOTE(review): num_features_train / num_features_test are not referenced by any
# later code visible in this notebook -- confirm whether they are still needed.
count_columns = ['nhashtags', 'nmentions', 'nnumbers', 'ninter', 'nexcl']
num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
num_features_train = num_transformer.fit_transform(train[count_columns])
num_features_test = num_transformer.transform(test[count_columns])

# Settings for the Hugging Face Trainer
training_args = TrainingArguments(
    # output and logging locations; no external reporting integration
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Build the trainer on the training dataset only (no evaluation set is passed)
trainer = Trainer(args=training_args, model=model, train_dataset=train_dataset)

# Fine-tune the model
trainer.train()

# Run inference on the test encodings (the dataset is built without labels)
test_dataset = Dataset(test_encodings)
predictions = trainer.predict(test_dataset)

# For every row, take the class index with the largest score
preds = predictions.predictions.argmax(axis=-1)

# Write the id/target pairs to submission.csv without the index column
submission = pd.DataFrame({'id': test["id"].values, 'target': preds})
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7458
20,0.6773
30,0.693
40,0.6845
50,0.679
60,0.6575
70,0.6621
80,0.6634
90,0.62
100,0.5726


Predictions saved to submission.csv
