In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import string
import re
import nltk
import gensim.downloader as api

import spacy

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('TRAINING_DATA.txt', delimiter='\t')

# Rename columns for easier reference
df.columns = ['label', 'sentence']

# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text

df['sentence'] = df['sentence'].apply(preprocess_text)

# Load Spanish tokenizer and lemmatizer
nlp = api.load('es_core_news_sm')

# Tokenize and lemmatize the sentences
def tokenize_and_lemmatize(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

df['sentence'] = df['sentence'].apply(tokenize_and_lemmatize)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.3, random_state=42)

# Vectorize the sentences using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the max_features parameter as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = svm_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: Incorrect model/corpus name

In [None]:
# Load the real data for prediction
real_data_file = 'REAL_DATA.txt'
with open(real_data_file, 'r', encoding='utf-8') as file:
    sentences = file.readlines()

# Preprocess the real data
real_data = pd.DataFrame(sentences, columns=['sentence'])
real_data['sentence'] = real_data['sentence'].apply(preprocess_text)
real_data['sentence'] = real_data['sentence'].apply(tokenize_and_lemmatize)

# Vectorize the real data sentences using TF-IDF
X_real_tfidf = tfidf_vectorizer.transform(real_data['sentence'])

# Make predictions on the real data
real_data_predictions = svm_classifier.predict(X_real_tfidf)

# Add predictions to the real_data dataframe
real_data['predicted_label'] = real_data_predictions

# Save the final results as a CSV file
real_data.to_csv('Real_Data_Predicted2.csv', index=False)

# Print the predictions
print(real_data)
