In [12]:
import sys

import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
# Fetch the NLTK tokenizer data; word_tokenize is called later in this notebook.
# NOTE(review): presumably 'punkt_tab' is requested alongside 'punkt' to cover
# newer NLTK releases — confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/celinewu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Load the Spanish and English spaCy pipelines (Spanish first, then English)
nlp_es, nlp_en = map(spacy.load, ('es_core_news_sm', 'en_core_web_sm'))

# Read the transcribed spreadsheet into a DataFrame.
# NOTE(review): this is an absolute path on one machine; it must be adjusted to run elsewhere.
transcript_path = "/Users/celinewu/Documents/GitHub/2024-25c-fai2-adsai-group-group16/Data/transcribed_data_whisper.xlsx"
df = pd.read_excel(transcript_path)

# Tokenize sentences with the Spanish spaCy pipeline (no translation is performed)
def translate_to_english(sentence):
    """Return ``sentence`` rebuilt from its spaCy tokens, joined by single spaces.

    Despite its name, this function does not translate anything: it runs the
    text through the Spanish pipeline ``nlp_es`` and joins the token texts, so
    the words stay in their original language.

    Args:
        sentence: Text to tokenize.

    Returns:
        The space-joined token texts, or ``""`` when ``sentence`` is not a
        string (for example the NaN pandas uses for an empty spreadsheet cell).
    """
    # The pipeline is only called with real text; anything else maps to "",
    # matching the guard used by the GoogleTranslator-based version of this helper.
    if not isinstance(sentence, str):
        return ""
    return ' '.join(token.text for token in nlp_es(sentence))

# Populate 'Sentence_English' by running translate_to_english over every row of 'Sentence'
raw_sentences = df['Sentence']
df['Sentence_English'] = raw_sentences.apply(translate_to_english)

# POS Tagging
def pos_tagging(sentence):
    """Return the ``pos_`` tag of every token in ``sentence`` as one space-separated string.

    The text is parsed with the English pipeline ``nlp_en``.
    """
    tags = [tok.pos_ for tok in nlp_en(sentence)]
    return ' '.join(tags)
df['POS_Tags'] = df['Sentence_English'].apply(pos_tagging)

# TF-IDF Calculation
# Fit the vectorizer on the whole 'Sentence_English' column, then store one dense
# row per sentence in the 'TF_IDF' column.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Sentence_English'])
dense_rows = tfidf_matrix.toarray()
df['TF_IDF'] = [row for row in dense_rows]

# Sentiment Analysis
def sentiment_score(sentence):
    """Return the TextBlob sentiment polarity computed for ``sentence``."""
    blob = TextBlob(sentence)
    return blob.sentiment.polarity
df['Sentiment_Score'] = df['Sentence_English'].apply(sentiment_score)

# Pretrained Word Embeddings
def get_word_embedding(sentence, word_vectors):
    """Average the pretrained vectors of the known words in ``sentence``.

    Args:
        sentence: Text to embed; it is lower-cased and split with ``word_tokenize``.
        word_vectors: Mapping-like lookup supporting ``word in word_vectors`` and
            ``word_vectors[word]`` (a plain dict works).

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``word_vectors``. When no token is found, an all-zero vector is returned
        instead; its length is ``word_vectors.vector_size`` if the table exposes
        that attribute, otherwise 300.
    """
    tokens = word_tokenize(sentence.lower())
    vectors = [word_vectors[token] for token in tokens if token in word_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the table itself when possible so it cannot disagree
    # with the real vectors; plain dicts keep the previous length of 300.
    return np.zeros(getattr(word_vectors, 'vector_size', 300))

# Load pretrained embeddings
# NOTE(review): nothing is loaded here — the lookup table is an empty dict, so no
# token is ever found in it and every row receives the all-zero fallback vector.
word_vectors = dict()
english_text = df['Sentence_English']
df['Pretrained_Embeddings'] = english_text.apply(lambda text: get_word_embedding(text, word_vectors))

# Custom Word Embedding Model
# Training corpus: one lower-cased token list per row of 'Sentence_English'.
sentences = list(map(lambda text: word_tokenize(text.lower()), df['Sentence_English']))

# Hyperparameters handed to Word2Vec for training on that corpus.
word2vec_params = dict(vector_size=300, window=5, min_count=1, workers=4)
word2vec_model = Word2Vec(sentences, **word2vec_params)

def get_custom_embedding(sentence, model):
    """Average the trained Word2Vec vectors of the known words in ``sentence``.

    Args:
        sentence: Text to embed; it is lower-cased and split with ``word_tokenize``.
        model: Trained model whose ``wv`` attribute supports ``word in model.wv``
            and ``model.wv[word]``.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv``, or an all-zero vector of length ``model.wv.vector_size``
        when none of the tokens is present.
    """
    tokens = word_tokenize(sentence.lower())
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # Take the fallback length from the model rather than a hard-coded 300, so the
    # zero vector still matches if the model is trained with another vector_size.
    return np.zeros(model.wv.vector_size)
df['Custom_Embeddings'] = df['Sentence_English'].apply(lambda x: get_custom_embedding(x, word2vec_model))

# Additional Feature: Sentence Length (number of whitespace-separated words)
df['Sentence_Length'] = df['Sentence_English'].apply(lambda text: len(text.split()))

# Save to file
# The array-valued columns are written as their text form. By default NumPy
# abbreviates any array with more than 1000 elements to "[a b ... y z]", which
# would silently drop most of a long TF-IDF row. Lifting the threshold while
# writing keeps every value; shorter arrays are written exactly as before.
with np.printoptions(threshold=sys.maxsize):
    df.to_csv('NLP_features.tsv', sep='\t', index=False)


In [23]:
import pandas as pd
from deep_translator import GoogleTranslator

# Read the transcribed spreadsheet into a DataFrame.
# NOTE(review): this is an absolute path on one machine; it must be adjusted to run elsewhere.
transcript_path = "/Users/celinewu/Documents/GitHub/2024-25c-fai2-adsai-group-group16/Data/transcribed_data_whisper.xlsx"
df = pd.read_excel(transcript_path)

# Create the translator, configured with source 'es' and target 'en'
language_pair = {'source': 'es', 'target': 'en'}
translator = GoogleTranslator(**language_pair)

# Translate one sentence with the module-level GoogleTranslator instance
def translate_to_english(sentence):
    """Translate ``sentence`` using ``translator`` on a best-effort basis.

    Returns:
        ``""`` when ``sentence`` is missing (``pd.isna``) or not a string;
        the result of ``translator.translate`` otherwise. If that call raises,
        the error is printed and the original ``sentence`` is returned unchanged.
    """
    is_missing = pd.isna(sentence)
    if is_missing or not isinstance(sentence, str):
        return ""

    try:
        translated = translator.translate(sentence)
    except Exception as e:
        # Keep going on failure: report it and fall back to the untranslated text.
        print(f"Translation error: {e}")
        return sentence
    return translated

# Apply translation to every row of 'Sentence'
df['Sentence_English'] = df['Sentence'].apply(translate_to_english)

# Save the translated dataset
output_path = "translated_data_whisper.xlsx"
df.to_excel(output_path, index=False)

# Report the file name that was actually written (the old message named a different file)
print(f"Translation complete. File saved as '{output_path}'.")


Translation complete. File saved as 'translated_data.xlsx'.
