In [24]:
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv('TRAINING_DATA.txt', delimiter='\t')   


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emin.sen\AppData\Roaming\nltk_data...


In [27]:
df.columns = ['label', 'sentence']

In [17]:
df.head()

Unnamed: 0,label,sentence
0,0,Hwang habló en Sur de este año por Southwest M...
1,1,Usted podría pensar Katy Perry y Robert Pattin...
2,1,Cualquiera que haya volado los cielos del crea...
3,1,"Bueno , este cantante tendrá un LARGO tiempo p..."
4,0,"Ya en octubre de 1940 , se registra , Hergé re..."


In [28]:
stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()

In [29]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['sentence'] = df['sentence'].apply(preprocess_text)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.2, random_state=42)


In [31]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])


In [32]:
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__min_df': [1, 5],
    'clf__alpha': [0.1, 1.0, 10],
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [34]:
print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'clf__alpha': 10, 'tfidf__max_df': 0.75, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 1)}


In [37]:
y_pred = grid_search.predict(X_test)


In [38]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.4324958123953099
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.47      0.46      1519
           1       0.42      0.39      0.40      1466

    accuracy                           0.43      2985
   macro avg       0.43      0.43      0.43      2985
weighted avg       0.43      0.43      0.43      2985



In [35]:
y_pred = grid_search.predict(X_test)


In [18]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [39]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [20]:
y_pred = clf.predict(X_test_tfidf)


In [21]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [23]:
print(f"Accuracy: {accuracy}")
print("Classification Report:")


Accuracy: 0.3611390284757119
Classification Report:


In [None]:
from bs4 import BeautifulSoup
def clean_html(html):
    # First we remove inline JavaScript/CSS:
    cleaned = html.strip()
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", " ", cleaned)
    
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", " ", cleaned)
    
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)   
    

    return cleaned.strip()

print("\n--- Cleaned text:---" )
print(clean_html(test_string))

from bs4 import BeautifulSoup
print(BeautifulSoup(test_string).get_text())