# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources if not already downloaded.
# Recent NLTK releases load the 'punkt_tab' tokenizer tables inside
# word_tokenize instead of the older pickled 'punkt' models, so both are
# fetched to keep tokenization working across NLTK versions.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Define preprocessing functions

# Patterns are compiled once because preprocess_text is applied to every
# tweet in the corpus.
_URL_PATTERN = re.compile(r'http\S+')
_USERNAME_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#\w+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# Lazily-built (stop-word set, lemmatizer) pair shared by all calls.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Return the cached English stop-word set and WordNet lemmatizer.

    The pair is built on first use so the stop-word corpus is read once
    rather than once per tweet.
    """
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Clean a single tweet and return it as a space-joined token string.

    Steps, in order: lowercase; remove URLs (``http...``), ``@user``
    mentions, ``#hashtags`` and punctuation; tokenize with
    ``word_tokenize``; drop English stop words; lemmatize each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN pandas produces for a missing cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower().
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _USERNAME_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub('', text)
    text = _PUNCTUATION_PATTERN.sub('', text)

    if not text.strip():
        # Nothing to tokenize; skip loading the NLTK resources entirely.
        return ''

    stop_words, lemmatizer = _get_text_resources()
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(words)

# Load DataSets
# Every file is read as a headerless, tab-separated table with the columns
# id, label, tweet.
# NOTE(review): pandas' default quoting treats '"' as a quote character, so a
# tweet that begins with a double quote could swallow the rows that follow it.
# Consider quoting=csv.QUOTE_NONE after checking the files.
_TRAIN_DATA_DIR = (
    "/content/drive/MyDrive/Sentiment_Detection/"
    "SemEval-2017-Task-4-A-B-C-using-BERT-main/"
    "SemEval-2017-Task-4-A-B-C-using-BERT-main/"
    "data/"
)
_TEST_DATA_PATH = (
    "/content/drive/MyDrive/Sentiment_Detection/Data/2017_English_final/"
    "GOLD/Subtask_A/SemEval2017-task4-test.subtask-A.english.txt"
)


def _read_semeval_tsv(path):
    """Read one headerless tab-separated file into an id/label/tweet frame."""
    return pd.read_csv(path, delimiter='\t', header=None, names=['id', 'label', 'tweet'])


train_df_one = _read_semeval_tsv(_TRAIN_DATA_DIR + "twitter-2013train-A.txt")
train_df_two = _read_semeval_tsv(_TRAIN_DATA_DIR + "twitter-2016train-A.txt")
train_df_three = _read_semeval_tsv(_TRAIN_DATA_DIR + "twitter-2015train-A.txt")
train_df_four = _read_semeval_tsv(_TRAIN_DATA_DIR + "twitter-2014sarcasm-A.txt")
# test_data_df = pd.read_csv("/content/drive/MyDrive/Sentiment_Detection/SemEval-2017-Task-4-A-B-C-using-BERT-main/SemEval-2017-Task-4-A-B-C-using-BERT-main/data/twitter-2016test-A_final.txt", delimiter='\t', header=None, names=['label', 'tweet','id'])
test_data_df = _read_semeval_tsv(_TEST_DATA_PATH)

# Concatenate training dataframes. ignore_index gives the combined frame a
# unique 0..n-1 index instead of repeating each file's own row numbers.
train_data_df = pd.concat(
    [train_df_one, train_df_two, train_df_three, train_df_four],
    ignore_index=True,
)

# Preprocess the tweets.
# The test tweets get exactly the same cleaning as the training tweets, so the
# features extracted from both splits are built from the same kind of tokens
# (previously only the training split was cleaned).
train_data_df['tweet'] = train_data_df['tweet'].apply(preprocess_text)
test_data_df['tweet'] = test_data_df['tweet'].apply(preprocess_text)

# Pull the text and the string labels out as arrays for the steps below.
train_tweet = train_data_df.tweet.values
y_train = train_data_df.label.values

test_tweet = test_data_df.tweet.values
y_test = test_data_df.label.values

# Encode the sentiment strings as integer class ids.
# A label that is missing from label_dict raises KeyError.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
train_labels = [label_dict[sentiment] for sentiment in y_train]
test_labels = [label_dict[sentiment] for sentiment in y_test]

# Report the size of each split.
print("We have {} training samples".format(len(train_tweet)))
print("We have {} test samples".format(len(test_tweet)))

# Convert the tweets to TF-IDF features (vocabulary capped at 5000 terms).
# The vectorizer is fitted on the training tweets only and then reused,
# unchanged, to transform the test tweets.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_tweet)
X_test_tfidf = tfidf_vectorizer.transform(test_tweet)

# Fit a linear-kernel SVM; C is the regularization strength and can be tuned.
svm_classifier = SVC(C=1.0, kernel='linear').fit(X_train_tfidf, train_labels)

# Predict a class id for every test tweet.
predicted = svm_classifier.predict(X_test_tfidf)

from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Accuracy, plus macro-averaged precision, recall and F1 over the three
# classes, taken from a single precision_recall_fscore_support call (the
# trailing "support" entry of its result is not needed here).
# NOTE(review): zero_division=1 scores an undefined per-class metric as 1,
# which can inflate the macro averages if a class is never predicted.
test_acc = accuracy_score(test_labels, predicted)
test_precision, test_recall, test_f1 = precision_recall_fscore_support(
    test_labels,
    predicted,
    labels=[0, 1, 2],
    average='macro',
    zero_division=1,
)[:3]

print(f'test_acc: {test_acc:.4f}')
print(f'f1 Score: {test_f1:.4f}')
print(f'precision: {test_precision:.4f}')
print(f'recall: {test_recall:.4f}')
