In [None]:

!pip install -q scikit-learn pandas


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from google.colab import drive


# Make the user's Google Drive visible inside the Colab filesystem.
print("Mounting Google Drive...")
drive.mount(mountpoint='/content/drive')
print("Google Drive mounted successfully.")


# Read the labelled CSV from Drive into a DataFrame named `data`.
file_path = '/content/drive/My Drive/train3.csv'
print("Loading dataset from Google Drive...")
data = pd.read_csv(filepath_or_buffer=file_path)
print(f"Dataset loaded successfully with {data.shape[0]} rows and {data.shape[1]} columns.")


# --- Clean the 'category' (label) column -------------------------------------
# Report how many labels are missing or infinite before modifying anything.
# Both +inf and -inf are counted, since the message promises "infinite values".
print("Checking for missing or infinite values in 'category' column:")
print(data['category'].isnull().sum())
print(((data['category'] == float('inf')) | (data['category'] == float('-inf'))).sum())


# Rows without a label cannot be used for training.
print("Dropping rows with missing 'category' values...")
data = data.dropna(subset=['category'])
print(f"After cleaning, dataset has {data.shape[0]} rows.")


# With the NaN labels removed the column can be cast to int.
# .assign() returns a new frame instead of writing into the filtered one.
print("Converting 'category' column to integer...")
data = data.assign(category=data['category'].astype(int))


# --- Clean the 'Text' column --------------------------------------------------
# The same two masks drive both the report and the removal, so the printed
# counts always add up to the number of rows dropped. "Empty" means empty after
# stripping whitespace (previously the report compared against '' exactly and
# therefore missed whitespace-only rows that the removal step did drop).
text_is_missing = data['Text'].isnull()
text_is_blank = data['Text'].str.strip() == ''

print("Checking for missing or empty 'Text' data:")
print(text_is_missing.sum())
print(text_is_blank.sum())

print("Removing rows with missing or empty 'Text' data...")
data = data[~(text_is_missing | text_is_blank)]
print(f"After cleaning, dataset has {data.shape[0]} rows.")


# Train on a random subset of the cleaned data. The fraction lives in one
# constant so the sampling call and the log message cannot disagree
# (previously the code sampled 80% while the message reported 20%).
SAMPLE_FRACTION = 0.2
data_sample = data.sample(frac=SAMPLE_FRACTION, random_state=42)
print(f"Selected {SAMPLE_FRACTION:.0%} of the data, now {data_sample.shape[0]} rows.")


# Hold out 20% of the sampled rows for evaluation; random_state fixes the split.
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    data_sample['Text'], data_sample['category'], test_size=0.2, random_state=42
)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


# Learn the TF-IDF vocabulary (capped at 5000 features) from the training text
# only, then encode the held-out text with that same fitted vectorizer.
print("Transforming text data into TF-IDF vectors...")
tfidf = TfidfVectorizer(max_features=5_000)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)
print("Text data transformed successfully.")


# Fit a linear-kernel SVM on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
print("Training SVM model...")
svm_model = SVC(kernel="linear", probability=True).fit(X_train_tfidf, y_train)
print("SVM model training completed.")


# Predict labels for the held-out split and report per-class and overall metrics.
print("Making predictions on the test set...")
y_pred = svm_model.predict(X_test_tfidf)

print(
    "Evaluation results:",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred))


# Write the held-out texts next to their predicted labels as a CSV on Drive.
print("Saving predictions to Google Drive...")
output_path = '/content/drive/My Drive/test_results.csv'

# Start from the text column and attach the predictions as a second column.
test_results = pd.DataFrame({"Text": X_test}).assign(
    **{"Predicted Sentiment": y_pred}
)

test_results.to_csv(output_path, index=False)
print(f"Test results saved to: {output_path}")


Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Loading dataset from Google Drive...
Dataset loaded successfully with 162980 rows and 2 columns.
Checking for missing or infinite values in 'category' column:
8698
0
Dropping rows with missing 'category' values...
After cleaning, dataset has 154282 rows.
Converting 'category' column to integer...
Checking for missing or empty 'Text' data:
4
0
Removing rows with missing or empty 'Text' data...
After cleaning, dataset has 154277 rows.
Selected 20% of the data, now 30855 rows.
Splitting data into training and test sets...
Training set size: 24684, Test set size: 6171
Transforming text data into TF-IDF vectors...
Text data transformed successfully.
Training SVM model...
SVM model training completed.
Making predictions on the test set...
Evaluation results:
Classification Report:
              precision    recall  f1-score   support

          -1       0.88      0.73      0.80      1357
           0       

In [None]:
# Load the test3.csv file from Drive for inference.
print("Loading test dataset (test3.csv) from Google Drive...")
test_file_path = '/content/drive/My Drive/test3.csv'
test_data = pd.read_csv(test_file_path)
print(f"Test dataset loaded successfully with {test_data.shape[0]} rows and {test_data.shape[1]} columns.")

# Report missing / blank texts. The header used to be printed with no counts
# after it; the counts are restored here and are computed from the same masks
# that drive the removal below, so report and removal always agree.
test_text_is_missing = test_data['Text'].isnull()
test_text_is_blank = test_data['Text'].str.strip() == ''

print("Checking for missing or empty 'Text' data in test3.csv:")
print(test_text_is_missing.sum())
print(test_text_is_blank.sum())


print("Removing rows with missing or empty 'Text' data in test3.csv...")
test_data = test_data[~(test_text_is_missing | test_text_is_blank)]
print(f"After cleaning, test dataset has {test_data.shape[0]} rows.")


print("Transforming test data into TF-IDF vectors...")
# Reuse the vectorizer that was fitted on the training text (`tfidf`, created
# in the training cell). Fitting a fresh TfidfVectorizer here would learn a
# different vocabulary and IDF weights, so column i would no longer mean the
# same term the SVM was trained on and the predictions would be meaningless.
# NOTE: this rebinds `X_test_tfidf`, which the training cell also defines.
X_test_tfidf = tfidf.transform(test_data['Text'])
print("Test data transformed successfully.")


# Label every cleaned test3 row with the trained SVM.
print("Making predictions on the test dataset (test3.csv)...")
y_pred = svm_model.predict(X_test_tfidf)

# Persist text + predicted label side by side on Drive.
print("Saving predictions to Google Drive...")
output_path = '/content/drive/My Drive/test3_predictions_svm.csv'
test_results = pd.DataFrame({"Text": test_data['Text']}).assign(
    **{"Predicted Sentiment": y_pred}
)
test_results.to_csv(output_path, index=False)
print(f"Test results saved to: {output_path}")

Loading test dataset (test3.csv) from Google Drive...
Test dataset loaded successfully with 12981 rows and 2 columns.
Checking for missing or empty 'Text' data in test3.csv:
3
0
Removing rows with missing or empty 'Text' data in test3.csv...
After cleaning, test dataset has 12978 rows.
Transforming test data into TF-IDF vectors...
Test data transformed successfully.
Making predictions on the test dataset (test3.csv)...
Saving predictions to Google Drive...
Test results saved to: /content/drive/My Drive/test3_predictions_svm.csv
