**Step by step for the Project NLP**

**1. Data Preprocessing -** *cleaning and preparing data*:
- Remove all the special characters
- Remove numbers
- Remove all single characters
- Remove single characters from the start
- Substitute multiple spaces with single space
- Convert to Lowercase
- Remove stopwords
- Lemmatization

**2. Feature extraction -** *Transforming the text data into numerical features*
- Using Sentence-Transformers extraction technique

**3. Model Selection -** *Choosing an appropriate machine learning model*
- Logistic Regression Model (good to predict the probability of an event occurring)

**4. Model Training -** *Training the models on the prepared data*

**5. Model Evaluation -** *Assessing the performance of the models*

**6. Prediction -** *Preparing the model to classify unseen data*

**7. Predict in a new given dataset** *Tagging new and unlabeled data*


In [1]:
#!pip install sentence-transformers

import pandas as pd
import nltk 
import re 

import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer

#Needed packages:
nltk.download('stopwords')
nltk.download('wordnet')


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aanas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aanas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read training data and test data

training_data = pd.read_csv("TRAINING_DATA.txt", sep='\t', header=None, names=['label','sentence'])

display(training_data.head())

Unnamed: 0,label,sentence
0,1,"Cuando conocí a Janice en 2013 , una familia n..."
1,0,Hwang habló en Sur de este año por Southwest M...
2,1,Usted podría pensar Katy Perry y Robert Pattin...
3,1,Cualquiera que haya volado los cielos del crea...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p..."


In [3]:
#Quick check for missing values
training_data.isna().any()

#No NaN values


label       False
sentence    False
dtype: bool

In [4]:
#Function for preprocessing

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words(['spanish']))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove all special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove all single characters
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    
    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s', '', text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text).strip()

    #Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    #Lemmatization
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

In [5]:
#Applying the preprocessing fucntion to the training data

training_data['cleaned_sentence'] = training_data['sentence'].apply(preprocess_text)

training_data.head()

Unnamed: 0,label,sentence,cleaned_sentence
0,1,"Cuando conocí a Janice en 2013 , una familia n...",conoc janice familia necesitaba puntos promedi...
1,0,Hwang habló en Sur de este año por Southwest M...,hwang habl sur ao southwest music and medium c...
2,1,Usted podría pensar Katy Perry y Robert Pattin...,usted podra pensar katy perry robert pattinson...
3,1,Cualquiera que haya volado los cielos del crea...,cualquiera volado cielos creador escuchado act...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p...",bueno cantante tendr largo tiempo sentir an m ...


In [6]:
#Splitting data into training and test sets
x = training_data['sentence']
y = training_data['label']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

In [7]:
#Using sentencetransformer for embeddings
sentence_model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')

#transforming the training and test data
x_train_embeddings = sentence_model.encode(x_train.tolist())
x_test_embeddings = sentence_model.encode(x_test.tolist())



In [8]:
#Initializing and training the logistic regression
model = LogisticRegression(max_iter=1000, random_state=42) 

#model.fit(x_train_tfidf, y_train)
model.fit(x_train_embeddings, y_train)

In [9]:
#evaluating the model on test data
#y_pred = model.predict(x_test_tfidf)
y_pred = model.predict(x_test_embeddings)

#printing classification report
report=classification_report(y_test, y_pred, output_dict=True)
print("Classification Report:")
print(f"Precision: {report['weighted avg']['precision']}")
print(f"Recall: {report['weighted avg']['recall']}")
print(f"F1-Score: {report['weighted avg']['f1-score']}")

#accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Classification Report:
Precision: 0.6725024576721899
Recall: 0.6716917922948074
F1-Score: 0.6714577673745435
Accuracy: 0.6716917922948074


In [10]:
# Reading the real data
real_data = pd.read_csv("REAL_DATA.txt", sep='\t', header=None, names=['label', 'sentence'], on_bad_lines='skip', engine='python')

# Applying the preprocessing function to the real data
real_data['cleaned_sentence'] = real_data['sentence'].apply(preprocess_text)

# Transforming the real data using the SentenceTransformer
real_data_embeddings = sentence_model.encode(real_data['cleaned_sentence'])

# Predicting the tags for the real data
real_data['predicted_label'] = model.predict(real_data_embeddings)

# Saving the results to a new CSV file
real_data[['predicted_label','sentence']].to_csv('REAL_DATA_PREDICTED.csv', index=False)

print("Predictions saved to REAL_DATA_PREDICTED.csv")

Predictions saved to REAL_DATA_PREDICTED.csv
