In [17]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [9]:
#Load and read dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [10]:
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/georgeey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
#Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)

        # Remove stopwords
        text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Apply preprocessing to the training set
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text).fillna("")

In [20]:
# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_df['preprocessed_text'])
y = train_df['sentiment']

In [22]:
# Train the model
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [23]:
# Evaluate the model
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

y_pred_valid = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_pred_valid)

In [24]:
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", valid_accuracy)


Training Accuracy: 0.827419941775837
Validation Accuracy: 0.6891031471711843


In [26]:
# Sentiment analysis on the test set
sia = SentimentIntensityAnalyzer()

predictions = []

for idx, row in test_df.iterrows():
    text_id = row['textID']
    tweet_text = row['text']
    
    preprocessed_text = preprocess_text(tweet_text)
    
    sentiment_scores = sia.polarity_scores(preprocessed_text)
    
    if sentiment_scores['compound'] >= 0.05:
        sentiment_label = 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'
    
    selected_text = tweet_text if sentiment_label == 'neutral' else tweet_text
    
    predictions.append((text_id, selected_text))


In [27]:

# Create submission file
submission_df = pd.DataFrame(predictions, columns=['textID', 'selected_text'])
submission_df.to_csv('submission.csv', index=False)