In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re
import string
import nltk

# Download NLTK stopwords and punkt tokenizer
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the dataset
df = pd.read_csv('TRAINING_DATA.txt', delimiter='\t')

# Rename columns for easier reference
df.columns = ['label', 'sentence']

# Preprocess text
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('spanish')]  # Remove stopwords
    lemmatized_tokens = [lemmatize_word(token) for token in tokens]  # Lemmatize
    return ' '.join(lemmatized_tokens)

# Simple rule-based lemmatizer for Spanish
def lemmatize_word(word):
    suffixes = {
        'ar': 'ar', 'er': 'er', 'ir': 'ir',  # Infinitive
        'ado': 'ar', 'ido': 'er', 'ido': 'ir',  # Past participle
        'ando': 'ar', 'iendo': 'er', 'iendo': 'ir'  # Gerund
    }
    for suffix, lemma in suffixes.items():
        if word.endswith(suffix):
            return word[:-len(suffix)] + lemma
    return word

df['sentence'] = df['sentence'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = vectorizer.transform(X_test)

# Train a classifier
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = clf.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.41943048576214403
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.40      0.41      1519
           1       0.41      0.44      0.43      1466

    accuracy                           0.42      2985
   macro avg       0.42      0.42      0.42      2985
weighted avg       0.42      0.42      0.42      2985



In [2]:
df.head()


Unnamed: 0,label,sentence
0,0,hwang habló sur año southwest music and media ...
1,1,usted podría pensar katy perry robert pattinso...
2,1,cualquiera volar cielos creador escuchar acto ...
3,1,bueno cantante largo tiempo sentir aún remordi...
4,0,octubre registra hergé recibe carta anónima ac...
