# Keywords #

Se van a utilizar diferentes métodos de Term Frequency (TF) e Inverse Document Frequency (IDF) para comparar las palabras clave que genera cada uno.
![image.png](attachment:image.png)

In [53]:
import csv
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)

# Read the preprocessed Spanish corpus into the notebook-wide DataFrame `df`;
# later cells take the text from its second column.
corpus_path = 'spanishpreprocessed_texts.csv'
df = pd.read_csv(corpus_path)

In [54]:
import os

# Make sure the output directory for the keyword CSV files exists.
# exist_ok=True keeps the cell re-runnable when the folder is already there.
output_dir = "keywords_tf_idf"
os.makedirs(output_dir, exist_ok=True)
print("Folder 'keywords_tf_idf' created.")

Folder 'keywords_tf_idf' created.


## TF_IDF ##

In [55]:
import csv

# TF-IDF keywords: score every vocabulary term by its TF-IDF weight summed
# over all documents (default TfidfVectorizer settings) and export the
# highest-scoring terms to a CSV file.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_idf.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum. Summing the sparse matrix directly avoids
# building a dense array and looping over every cell in Python.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total TF-IDF score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file; UTF-8 matches what pandas expects when
# the file is read back later in the notebook.
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_idf.txt'.


## TF ##

In [56]:
# Raw term-frequency keywords: count whitespace-separated tokens over the
# whole corpus and export the most frequent ones to a CSV file.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf.csv'
top_n = 70  # number of keywords to export

# Calculate term frequencies over all documents joined into one string
word_counts = Counter(' '.join(df[text_column]).split())

# most_common() returns (word, count) pairs sorted by descending count
keywords = word_counts.most_common()

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf.txt'.


## TF_LOG_IDF ##

In [57]:
# Log-TF / smoothed-IDF keywords: sublinear_tf=True replaces tf with
# 1 + log(tf); scores are summed over all documents and the top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_log_idf.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with logarithm TF and normal IDF
vectorizer = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_log_idf.txt'.


## TF_LOG_IDF_PROB ##

In [58]:
# Log-TF / unsmoothed-IDF keywords: scores are summed over all documents and
# the top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_log_idf_prob.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with logarithm TF and unsmoothed IDF.
# NOTE(review): with smooth_idf=False scikit-learn uses idf = ln(n / df) + 1,
# which is not the textbook probabilistic IDF log((n - df) / df) -- confirm
# that this is the weighting intended by the "PROB" label.
vectorizer = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_log_idf_prob.txt'.


## TF_LOG ##

In [59]:
# Log-TF keywords without IDF: each document vector holds 1 + log(tf) values,
# L2-normalised; scores are summed over all documents and the top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_log.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with logarithm TF and no IDF
vectorizer = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False, sublinear_tf=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_log.txt'.


## TF_AUG_IDF ##

In [60]:
# Augmented-TF / smoothed-IDF keywords: scores are summed over all documents
# and the top terms exported.
#
# TfidfVectorizer has no augmented-TF option (sublinear_tf=False is plain raw
# TF), so the weighting is built explicitly: raw counts -> augmented TF ->
# IDF weighting and L2 normalisation via TfidfTransformer.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_aug_idf.csv'
top_n = 70  # number of keywords to export

# Raw term counts per document as a float CSR matrix
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
counts = count_vectorizer.fit_transform(df[text_column]).tocsr().astype(np.float64)

# Augmented TF for the terms present in a document:
#   0.5 + 0.5 * tf / max_tf_in_document
# Terms absent from a document stay at 0 so the matrix remains sparse.
# Every stored count is >= 1, so the per-document maximum is never zero here.
max_per_doc = np.asarray(counts.max(axis=1).toarray()).ravel()
entries_per_doc = np.diff(counts.indptr)  # number of stored terms in each row
counts.data = 0.5 + 0.5 * counts.data / np.repeat(max_per_doc, entries_per_doc)

# Apply normal (smoothed) IDF and L2-normalise each document vector
weighting = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
X = weighting.fit_transform(counts)

# Get the feature names (words); entry j corresponds to column j of X
feature_names = count_vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_aug_idf.txt'.


## TF_AUG_IDF_PROB ##

In [61]:
# Augmented-TF / unsmoothed-IDF keywords: scores are summed over all documents
# and the top terms exported.
#
# TfidfVectorizer has no augmented-TF option (sublinear_tf=False is plain raw
# TF), so the weighting is built explicitly: raw counts -> augmented TF ->
# IDF weighting and L2 normalisation via TfidfTransformer.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_aug_idf_prob.csv'
top_n = 70  # number of keywords to export

# Raw term counts per document as a float CSR matrix
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
counts = count_vectorizer.fit_transform(df[text_column]).tocsr().astype(np.float64)

# Augmented TF for the terms present in a document:
#   0.5 + 0.5 * tf / max_tf_in_document
# Terms absent from a document stay at 0 so the matrix remains sparse.
# Every stored count is >= 1, so the per-document maximum is never zero here.
max_per_doc = np.asarray(counts.max(axis=1).toarray()).ravel()
entries_per_doc = np.diff(counts.indptr)  # number of stored terms in each row
counts.data = 0.5 + 0.5 * counts.data / np.repeat(max_per_doc, entries_per_doc)

# Apply unsmoothed IDF and L2-normalise each document vector.
# NOTE(review): with smooth_idf=False scikit-learn uses idf = ln(n / df) + 1,
# which is not the textbook probabilistic IDF log((n - df) / df) -- confirm
# that this is the weighting intended by the "PROB" label.
weighting = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)
X = weighting.fit_transform(counts)

# Get the feature names (words); entry j corresponds to column j of X
feature_names = count_vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_aug_idf_prob.txt'.


## TF_AUG ##

In [62]:
# Augmented-TF keywords without IDF: scores are summed over all documents and
# the top terms exported.
#
# TfidfVectorizer has no augmented-TF option (sublinear_tf=False is plain raw
# TF), so the weighting is built explicitly: raw counts -> augmented TF ->
# L2 normalisation via TfidfTransformer with IDF disabled.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_aug.csv'
top_n = 70  # number of keywords to export

# Raw term counts per document as a float CSR matrix
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
counts = count_vectorizer.fit_transform(df[text_column]).tocsr().astype(np.float64)

# Augmented TF for the terms present in a document:
#   0.5 + 0.5 * tf / max_tf_in_document
# Terms absent from a document stay at 0 so the matrix remains sparse.
# Every stored count is >= 1, so the per-document maximum is never zero here.
max_per_doc = np.asarray(counts.max(axis=1).toarray()).ravel()
entries_per_doc = np.diff(counts.indptr)  # number of stored terms in each row
counts.data = 0.5 + 0.5 * counts.data / np.repeat(max_per_doc, entries_per_doc)

# No IDF: only L2-normalise each document vector
weighting = TfidfTransformer(norm='l2', use_idf=False, smooth_idf=False, sublinear_tf=False)
X = weighting.fit_transform(counts)

# Get the feature names (words); entry j corresponds to column j of X
feature_names = count_vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_aug_.txt'.


## TF_BOOL_IDF ##

In [63]:
# Boolean-TF / smoothed-IDF keywords: binary=True makes every non-zero term
# count 1 before IDF weighting; scores are summed over all documents and the
# top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_boolean_idf.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with boolean TF and normal IDF
vectorizer = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False, binary=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_boolean_idf.txt'.


## TF_BOOL_IDF_PROB ##

In [64]:
# Boolean-TF / unsmoothed-IDF keywords: binary=True makes every non-zero term
# count 1 before IDF weighting; scores are summed over all documents and the
# top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_boolean_idf_prob.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with boolean TF and unsmoothed IDF.
# NOTE(review): with smooth_idf=False scikit-learn uses idf = ln(n / df) + 1,
# which is not the textbook probabilistic IDF log((n - df) / df) -- confirm
# that this is the weighting intended by the "PROB" label.
vectorizer = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False, binary=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_boolean_idf_prob.txt'.


## TF_BOOL ##

In [65]:
# Boolean-TF keywords without IDF: binary=True makes every non-zero term count
# 1, then each document vector is L2-normalised; scores are summed over all
# documents and the top terms exported.

# Assuming the text is in the second column
text_column = df.columns[1]
output_path = 'keywords_tf_idf/top_keywords_tf_boolean.csv'
top_n = 70  # number of keywords to export

# Create the TF-IDF vectorizer with boolean TF and no IDF
vectorizer = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False, sublinear_tf=False, binary=True)

# Fit and transform the text data (X is a sparse documents x terms matrix)
X = vectorizer.fit_transform(df[text_column])

# Get the feature names (words); entry j corresponds to column j of X
feature_names = vectorizer.get_feature_names_out()

# Total score per term = column sum, computed on the sparse matrix to avoid a
# dense copy and a Python loop over every cell.
total_scores = np.asarray(X.sum(axis=0)).ravel().tolist()

# Sort the keywords by their total score (ties keep vocabulary order)
keywords = sorted(zip(feature_names, total_scores), key=lambda item: item[1], reverse=True)

# Save the top keywords to a CSV file (UTF-8 so pandas can read it back)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Keyword', 'Score'])
    writer.writerows(keywords[:top_n])

print(f"Top keywords saved to '{output_path}'.")

Top keywords saved to 'top_keywords_tf_boolean.txt'.


In [66]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Post-process every keyword CSV except the raw-count one: drop a few generic
# corpus words, add a min-max normalised 'NScore' column, and rewrite the file
# in place.

# Define the folder path
folder_path = "keywords_tf_idf"

# List all CSV files in the folder; sorted so the processing order (and the
# order used by later cells that reuse csv_files) is deterministic
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Initialize the scaler
scaler = MinMaxScaler()

# Process each CSV file
for file in csv_files:
    # The raw term-frequency file is left untouched
    if file == "top_keywords_tf.csv":
        continue

    file_path = os.path.join(folder_path, file)

    # Read into a dedicated name so the corpus DataFrame `df` loaded at the top
    # of the notebook is not overwritten (earlier cells stay re-runnable)
    keywords_df = pd.read_csv(file_path)

    # Delete rows whose keyword is one of these generic corpus words;
    # .copy() so the column assignment below acts on an independent frame
    keywords_df = keywords_df[~keywords_df['Keyword'].isin(['estudiante', 'curso', 'estudiant'])].copy()

    # Normalize the "Score" column to [0, 1] within this file
    keywords_df['NScore'] = scaler.fit_transform(keywords_df[['Score']]).ravel()

    # Save the updated dataframe back to the CSV file
    keywords_df.to_csv(file_path, index=False)

print("Normalization complete and saved to CSV files.")

Normalization complete and saved to CSV files.


In [67]:
from collections import defaultdict

# Count in how many keyword files (one per weighting scheme) each keyword
# appears, then list the keywords that appear in fewer than `file_threshold`.

# Initialize a dictionary to store the count of each keyword
keyword_count = defaultdict(int)

# Keywords seen in fewer files than this are printed. 10 equals the number of
# non-TF keyword files written by the cells above.
file_threshold = 10

# Process each CSV file
for file in csv_files:
    # The raw term-frequency file is excluded from the comparison
    if file == "top_keywords_tf.csv":
        continue
    file_path = os.path.join(folder_path, file)

    # Read into a dedicated name so the corpus DataFrame `df` is not overwritten
    keywords_df = pd.read_csv(file_path)

    # Count the occurrences of each keyword
    for keyword in keywords_df['Keyword']:
        keyword_count[keyword] += 1

# Print the count of each keyword below the threshold
for keyword, count in keyword_count.items():
    if count < file_threshold:
        print(f"{keyword}: {count}")

educativo: 7
programacion: 6
aprender: 7
estrategia: 9
matematica: 7
caso: 5
social: 3
evaluacion: 3
crear: 4
conocimiento: 1
disenar: 3
basico: 2
ingenieria: 1
integral: 2


In [68]:
from collections import defaultdict

# Combine the normalised scores of all weighting schemes: sum each keyword's
# NScore across the keyword files and save the ranking to 'keywords_final.csv'.

# Initialize a dictionary to store the sum of NScores for each keyword
keyword_nscore_sum = defaultdict(float)

# Process each CSV file
for file in csv_files:
    # The raw term-frequency file has no NScore column and is skipped
    if file == "top_keywords_tf.csv":
        continue

    file_path = os.path.join(folder_path, file)

    # Read into a dedicated name so the corpus DataFrame `df` is not overwritten
    scores_df = pd.read_csv(file_path)

    # Sum the NScores for each keyword; iterating the two columns directly
    # avoids the per-row Series construction of iterrows()
    for keyword, nscore in zip(scores_df['Keyword'], scores_df['NScore']):
        keyword_nscore_sum[keyword] += nscore

# Convert the dictionary to a DataFrame
final_df = pd.DataFrame(list(keyword_nscore_sum.items()), columns=['Keyword', 'NScore'])

# Sort the DataFrame by NScore in descending order
final_df = final_df.sort_values(by='NScore', ascending=False)

# Save the result to a new CSV file
final_df.to_csv('keywords_final.csv', index=False)

print("Final keywords saved to 'keywords_final.csv'.")

Final keywords saved to 'keywords_final.csv'.
