In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mc-datathon-2025-sentiment-analysis/train.csv
/kaggle/input/mc-datathon-2025-sentiment-analysis/test.csv


In [3]:
# Download link
from IPython.display import FileLink
print("✅ Submission ready! Click below to download:")
FileLink('submission.csv')

✅ Submission ready! Click below to download:


In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.naive_bayes import ComplementNB  # Better than MultinomialNB for imbalanced data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone

# Enhanced preprocessing with emoji/slang handling
def advanced_clean(text):
    if not isinstance(text, str):
        return ""
    
    # Emoticon conversion
    emoji_pos = r"(?::|;|=)(?:-)?(?:\)|D|P|\(\^_\^\)|\(\^O\^\)|\(\^u\^\))"
    emoji_neg = r"(?::|;|=)(?:-)?(?:\(|/|\\|>\()"
    text = re.sub(emoji_pos, " emo_pos ", text)
    text = re.sub(emoji_neg, " emo_neg ", text)
    
    # Advanced negation handling
    text = re.sub(r"(?<!\w)(not|no|never|don't|isn't|can't|won't|didn't|wasn't)\b", " not_", text, flags=re.IGNORECASE)
    
    # Intensifier handling
    text = re.sub(r"\b(very|really|extremely|absolutely|completely|totally)\b", " intense_", text)
    
    # Clean URLs and special chars
    text = re.sub(r'http\S+|www\S+|https\S+|@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s.!?]', ' ', text)
    
    return text.lower().strip()

# Load data
train_df = pd.read_csv('/kaggle/input/mc-datathon-2025-sentiment-analysis/train.csv')
test_df = pd.read_csv('/kaggle/input/mc-datathon-2025-sentiment-analysis/test.csv')
train_df = train_df.dropna(subset=['text', 'sentiment'])
test_df = test_df.dropna(subset=['text'])

# Apply preprocessing
train_df['cleaned_text'] = train_df['text'].apply(advanced_clean)
test_df['cleaned_text'] = test_df['text'].apply(advanced_clean)

# Prepare data - smaller validation set for more training data
X_train, X_val, y_train, y_val = train_test_split(
    train_df['cleaned_text'],
    train_df['sentiment'],
    test_size=0.1,  # Only 10% validation
    random_state=42,
    stratify=train_df['sentiment']
)

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

# Feature engineering with character n-grams
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    analyzer='char_wb',  # Character n-grams with word boundaries
    strip_accents='unicode'
)

# Base models
lr = CalibratedClassifierCV(
    LogisticRegression(
        class_weight='balanced',
        solver='saga',
        max_iter=2500,
        C=0.7,
        penalty='l2',
        random_state=42
    ),
    cv=3
)

cnb = ComplementNB(alpha=0.2)  # Works better with TF-IDF than MultinomialNB

# Stacked model
model = Pipeline([
    ('tfidf', tfidf),
    ('clf', StackingClassifier(
        estimators=[
            ('lr', lr),
            ('cnb', cnb),
            ('rf', RandomForestClassifier(
                n_estimators=150,
                max_depth=20,
                class_weight='balanced_subsample',
                random_state=42
            ))
        ],
        final_estimator=LogisticRegression(
            solver='lbfgs',
            max_iter=2000,
            C=0.8
        ),
        passthrough=True,
        n_jobs=1  # Avoid memory issues
    ))
])

# Train
print("Training high-precision model...")
model.fit(X_train, y_train_enc)

# Validate
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val_enc, val_preds)
print(f"\nValidation Accuracy: {val_acc:.5f}")
print(classification_report(y_val_enc, val_preds, target_names=le.classes_))

# Optimized prediction with class balancing
test_probs = model.predict_proba(test_df['cleaned_text'])
class_weights = np.array([1.2, 1.0, 1.15])  # [negative, neutral, positive]
final_preds = np.argmax(test_probs * class_weights, axis=1)

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'sentiment': le.inverse_transform(final_preds)
})

# Save
submission.to_csv('submission.csv', index=False)
print("\n✅ High-accuracy submission created!")
print("Prediction distribution:")
print(submission['sentiment'].value_counts(normalize=True))

Training high-precision model...

Validation Accuracy: 0.65314
              precision    recall  f1-score   support

    negative       0.62      0.61      0.62       222
     neutral       0.60      0.66      0.63       290
    positive       0.76      0.69      0.72       252

    accuracy                           0.65       764
   macro avg       0.66      0.65      0.65       764
weighted avg       0.66      0.65      0.65       764


✅ High-accuracy submission created!
Prediction distribution:
sentiment
neutral     0.342363
positive    0.331412
negative    0.326225
Name: proportion, dtype: float64
