IMPORTING LIBRARIES

In [1]:
import pandas as pd
import re
import os
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


DATA LOADING

In [2]:

data = pd.read_csv("chunk_1.csv")


DATA RESIZING

In [3]:
import os
import pandas as pd

def split_file(input_file, output_dir, chunk_size):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Read the input file
    df = pd.read_csv(input_file)
    
    # Calculate the total number of rows and columns
    total_rows, total_columns = df.shape
    
    # Calculate the size of each row
    row_size = df.memory_usage(deep=True).sum() // total_rows
    
    # Calculate the number of rows per chunk based on the chunk size
    rows_per_chunk = chunk_size // row_size
    
    # Calculate the number of chunks
    num_chunks = total_rows // rows_per_chunk + (1 if total_rows % rows_per_chunk != 0 else 0)
    
    # Split the DataFrame into chunks and write each chunk to a CSV file
    for i in range(num_chunks):
        start_row = i * rows_per_chunk
        end_row = min((i + 1) * rows_per_chunk, total_rows)
        chunk_df = df.iloc[start_row:end_row]
        chunk_file_path = os.path.join(output_dir, f'chunk_{i+1}.csv')
        chunk_df.to_csv(chunk_file_path, index=False)

if __name__ == "__main__":
    input_file = "enwiki-20170820.csv"  # Replace with your input file path
    output_dir = "output_chunks1"  # Output directory to save the chunks
    chunk_size = 1024 * 1024 * 1024  # 0.5 GB in bytes
    
    split_file(input_file, output_dir, chunk_size)


SAMPLE DATA

In [3]:
sample=data.head(501)
data1 = sample.copy()
data1

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,\n\n\n\n\n\n'''Anarchism''' is a political phi...
1,0,Anarchism,Etymology and terminology,\n\nThe term ''anarchism'' is a compound word ...
2,0,Anarchism,History,\n\n===Origins===\nWoodcut from a Diggers docu...
3,0,Anarchism,Anarchist schools of thought,\nPortrait of philosopher Pierre-Joseph Proudh...
4,0,Anarchism,Internal issues and debates,\nconsistent with anarchist values is a contro...
...,...,...,...,...
496,43,Analysis of variance,Introduction,\n\n'''Analysis of variance''' ('''ANOVA''') i...
497,43,Analysis of variance,History,While the analysis of variance reached fruitio...
498,43,Analysis of variance,Motivating example,No fit.Fair fitVery good fitThe analysis of va...
499,43,Analysis of variance,Background and terminology,ANOVA is a particular form of statistical hypo...


PREPROCESSING

In [4]:
# Replace NaN values in 'SECTION_TEXT' with an empty string
data1['SECTION_TEXT'].fillna('', inplace=True)

# Group by 'ARTICLE_ID' and 'TITLE', and aggregate 'SECTION_TEXT'
data1 = data1.groupby(['ARTICLE_ID', 'TITLE'], as_index=False)['SECTION_TEXT'].agg(' '.join)

# Display the resulting DataFrame
data1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['SECTION_TEXT'].fillna('', inplace=True)


Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TEXT
0,0,Anarchism,\n\n\n\n\n\n'''Anarchism''' is a political phi...
1,1,Autism,\n\n\n \n\n\n\n\n'''Autism''' is a neurodevelo...
2,2,Albedo,\n\nPercentage of diffusely reflected sunlight...
3,3,A,\n\n\n\n\n\n\nWriting cursive forms of A\n'''A...
4,4,Alabama,\n\n\n\n\n\n\n'''Alabama''' () is a state in t...
5,5,Achilles,\n\nAchilles and the Nereid Cymothoe Attic red...
6,6,Abraham Lincoln,\n\n\n\n\n\n'''Abraham Lincoln''' (; February ...
7,7,Aristotle,\n\n\n\n\n'''Aristotle''' (; ''Aristotélēs''...
8,8,An American in Paris,\n\n\n\n'''''An American in Paris''''' is a ja...
9,9,Academy Award for Best Production Design,\n\n\nThe '''Academy Award for Best Production...


In [5]:

# Download NLTK resources (you only need to do this once)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Function to remove HTML tags
def remove_html_tags(text):
    clean_text = re.sub(r'<.*?>', '', text)  # Remove HTML tags using regex
    return clean_text

# Apply the remove_html_tags function to the SECTION_TEXT column
data1['SECTION_TEXT'] = data1['SECTION_TEXT'].apply(remove_html_tags)

# Apply preprocessing to the SECTION_TEXT column
data1['SECTION_TEXT'] = data1['SECTION_TEXT'].apply(preprocess_text)

# Fill missing ARTICLE_ID values with "text not available"
data1['ARTICLE_ID'].fillna("text not available", inplace=True)

# Display the preprocessed data
# print(sample)


[nltk_data] Downloading package punkt to C:\Users\M.Awais
[nltk_data]     Khaleeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\M.Awais
[nltk_data]     Khaleeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\M.Awais
[nltk_data]     Khaleeq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['ARTICLE_ID'].fillna("text not available", inplace=True)


In [6]:
data1

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TEXT
0,0,Anarchism,anarchism political philosophy advocate selfgo...
1,1,Autism,autism neurodevelopmental disorder characteriz...
2,2,Albedo,percentage diffusely reflected sunlight relati...
3,3,A,writing cursive form named plural aes first le...
4,4,Alabama,alabama state southeastern region united state...
5,5,Achilles,achilles nereid cymothoe attic redfigure kanth...
6,6,Abraham Lincoln,abraham lincoln february 12 1809 april 15 1865...
7,7,Aristotle,aristotle aristotélēs 384322 bc ancient greek ...
8,8,An American in Paris,american paris jazzinfluenced orchestral piece...
9,9,Academy Award for Best Production Design,academy award best production design recognize...


SEARCHING

In [7]:

# Function to find and return row based on ARTICLE_ID or TITLE
def find_row(query):
    if query.isdigit():  # Check if the query is an ARTICLE_ID
        result = data1[data1['ARTICLE_ID'] == int(query)]
    else:  # Assume the query is a TITLE
        result = data1[data1['TITLE'].str.contains(query, case=False)]
    
    if not result.empty:
        return result
    else:
        print("No matching ARTICLE_ID or TITLE found.")
        return None

# Accept query from user at runtime
user_query = input("Enter ARTICLE_ID: ")

# Find and display row based on user query
result_df = find_row(user_query)
if result_df is not None:
    print(result_df.to_string(index=False))


VOCABULARY

In [8]:

# Function to create vocabulary from SECTION_TEXT
def create_vocabulary(data_frame):
    vocabulary = {}
    index = 0
    for text in data_frame['SECTION_TEXT']:
        # Skip NaN values
        if pd.notna(text):
            words = word_tokenize(text)
            for word in words:
                if word not in vocabulary:
                    vocabulary[word] = index
                    index += 1
    return vocabulary

# Create vocabulary
vocabulary = create_vocabulary(data1)

# Print the vocabulary
print("Vocabulary:", vocabulary)




FILE SAPARATION

In [9]:
data1.to_csv("preprocessed_data.txt", sep='\t', index=False)


In [10]:
data1.to_csv("preprocessed_data.csv", index=False)

TF/IDF

In [10]:
# Create a new column named 'term_frequency' in your DataFrame (data1)
data1['term_frequency'] = ''

# Iterate over each row in the DataFrame
for index, row in data1.iterrows():
    # Initialize a dictionary to store the frequency of each term
    term_freq = {}
    
    # Split the text into words
    words = row['SECTION_TEXT'].split()
    
    # Count the frequency of each word in the text
    for word in words:
        term_id = vocabulary.get(word)  # Get the term ID from the vocabulary
        if term_id is not None:  # If the word is present in the vocabulary
            term_freq[term_id] = term_freq.get(term_id, 0) + 1  # Increment the frequency count
    
    # Create a list of tuples representing term ID and its frequency
    term_freq_list = [(term_id, frequency) for term_id, frequency in term_freq.items() if frequency > 0]
    
    # Assign the term frequency list to the 'term_frequency' column for the current row
    data1.at[index, 'term_frequency'] = term_freq_list


In [11]:
data1

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TEXT,term_frequency
0,0,Anarchism,anarchism political philosophy advocate selfgo...,"[(0, 111), (1, 21), (2, 13), (3, 8), (4, 1), (..."
1,1,Autism,autism neurodevelopmental disorder characteriz...,"[(2452, 209), (2453, 2), (2454, 35), (2455, 3)..."
2,2,Albedo,percentage diffusely reflected sunlight relati...,"[(3611, 1), (3760, 2), (694, 8), (3761, 6), (3..."
3,3,A,writing cursive form named plural aes first le...,"[(1466, 3), (4220, 7), (1176, 18), (3638, 1), ..."
4,4,Alabama,alabama state southeastern region united state...,"[(4493, 277), (21, 240), (4494, 6), (1094, 12)..."
5,5,Achilles,achilles nereid cymothoe attic redfigure kanth...,"[(6463, 262), (6464, 4), (6465, 1), (6466, 6),..."
6,6,Abraham Lincoln,abraham lincoln february 12 1809 april 15 1865...,"[(7825, 35), (7826, 458), (1074, 12), (2536, 7..."
7,7,Aristotle,aristotle aristotélēs 384322 bc ancient greek ...,"[(9394, 255), (9395, 1), (9396, 1), (188, 17),..."
8,8,An American in Paris,american paris jazzinfluenced orchestral piece...,"[(571, 22), (459, 24), (10341, 1), (10342, 2),..."
9,9,Academy Award for Best Production Design,academy award best production design recognize...,"[(9405, 4), (10541, 12), (964, 6), (1829, 3), ..."


In [12]:
# Initialize a dictionary to store the IDF for each term
idf_dict = {term_id: 0 for term_id in range(len(vocabulary))}

# Iterate over each term in the vocabulary
for term_id in vocabulary.values():
    # Iterate over each row in the DataFrame
    for index, row in data1.iterrows():
        # Check if the term appears in the row's term frequency list
        if any(term_id == term_freq_tuple[0] for term_freq_tuple in row['term_frequency']):
            # Increment the IDF count for the term
            idf_dict[term_id] += 1

# Convert the IDF dictionary to a list of tuples
idf_list = list(idf_dict.items())

# Display the IDF list
print(idf_list)


[(0, 2), (1, 14), (2, 14), (3, 7), (4, 1), (5, 22), (6, 36), (7, 4), (8, 15), (9, 29), (10, 17), (11, 2), (12, 29), (13, 33), (14, 19), (15, 20), (16, 11), (17, 1), (18, 14), (19, 19), (20, 19), (21, 37), (22, 1), (23, 2), (24, 3), (25, 1), (26, 23), (27, 1), (28, 4), (29, 12), (30, 2), (31, 3), (32, 3), (33, 19), (34, 15), (35, 35), (36, 21), (37, 29), (38, 23), (39, 26), (40, 11), (41, 4), (42, 3), (43, 29), (44, 2), (45, 7), (46, 10), (47, 4), (48, 1), (49, 9), (50, 4), (51, 3), (52, 1), (53, 1), (54, 1), (55, 15), (56, 7), (57, 20), (58, 5), (59, 25), (60, 28), (61, 30), (62, 20), (63, 19), (64, 1), (65, 2), (66, 33), (67, 23), (68, 15), (69, 12), (70, 1), (71, 3), (72, 20), (73, 20), (74, 11), (75, 3), (76, 6), (77, 8), (78, 2), (79, 17), (80, 3), (81, 14), (82, 13), (83, 18), (84, 1), (85, 23), (86, 2), (87, 12), (88, 30), (89, 3), (90, 24), (91, 12), (92, 2), (93, 3), (94, 2), (95, 18), (96, 12), (97, 15), (98, 16), (99, 1), (100, 23), (101, 38), (102, 24), (103, 3), (104, 1), (

In [13]:

# Iterate over each row in the DataFrame
for index, row in data1.iterrows():
    # Initialize a list to store the updated TF-IDF weights
    tfidf_weights = []
    
    # Iterate over each term frequency tuple in the row
    for term_id, term_freq in row['term_frequency']:
        # Find the IDF value for the term
        idf_value = next((idf_tuple[1] for idf_tuple in idf_list if idf_tuple[0] == term_id), 0)
        
        # Calculate TF-IDF weight for the term
        tfidf_weight = term_freq / idf_value
        tfidf_weights.append((term_id, tfidf_weight))
    
    # Assign the TF-IDF weights to the 'term_frequency' column for the current row
    data1.at[index, 'term_frequency'] = tfidf_weights

data1


Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TEXT,term_frequency
0,0,Anarchism,anarchism political philosophy advocate selfgo...,"[(0, 55.5), (1, 1.5), (2, 0.9285714285714286),..."
1,1,Autism,autism neurodevelopmental disorder characteriz...,"[(2452, 209.0), (2453, 2.0), (2454, 8.75), (24..."
2,2,Albedo,percentage diffusely reflected sunlight relati...,"[(3611, 0.125), (3760, 2.0), (694, 0.888888888..."
3,3,A,writing cursive form named plural aes first le...,"[(1466, 0.16666666666666666), (4220, 7.0), (11..."
4,4,Alabama,alabama state southeastern region united state...,"[(4493, 92.33333333333333), (21, 6.48648648648..."
5,5,Achilles,achilles nereid cymothoe attic redfigure kanth...,"[(6463, 131.0), (6464, 4.0), (6465, 1.0), (646..."
6,6,Abraham Lincoln,abraham lincoln february 12 1809 april 15 1865...,"[(7825, 17.5), (7826, 152.66666666666666), (10..."
7,7,Aristotle,aristotle aristotélēs 384322 bc ancient greek ...,"[(9394, 42.5), (9395, 1.0), (9396, 1.0), (188,..."
8,8,An American in Paris,american paris jazzinfluenced orchestral piece...,"[(571, 0.6875), (459, 1.7142857142857142), (10..."
9,9,Academy Award for Best Production Design,academy award best production design recognize...,"[(9405, 0.36363636363636365), (10541, 0.923076..."


VECTORS

In [14]:
import numpy as np

# Initialize the 'vector' column with empty arrays
data1['vector'] = [np.zeros(len(idf_list)) for _ in range(len(data1))]

# Iterate over each row in the DataFrame
for index, row in data1.iterrows():
    # Initialize an array to store the vector for the current row
    row_vector = np.zeros(len(idf_list))
    
    # Update the vector using the TF-IDF weights
    for term_id, tfidf_weight in row['term_frequency']:
        row_vector[term_id] = tfidf_weight
    
    # Assign the vector to the 'vector' column for the current row
    data1.at[index, 'vector'] = row_vector

data1


Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TEXT,term_frequency,vector
0,0,Anarchism,anarchism political philosophy advocate selfgo...,"[(0, 55.5), (1, 1.5), (2, 0.9285714285714286),...","[55.5, 1.5, 0.9285714285714286, 1.142857142857..."
1,1,Autism,autism neurodevelopmental disorder characteriz...,"[(2452, 209.0), (2453, 2.0), (2454, 8.75), (24...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.045454545454545456..."
2,2,Albedo,percentage diffusely reflected sunlight relati...,"[(3611, 0.125), (3760, 2.0), (694, 0.888888888...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0555555555555..."
3,3,A,writing cursive form named plural aes first le...,"[(1466, 0.16666666666666666), (4220, 7.0), (11...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0277777777777..."
4,4,Alabama,alabama state southeastern region united state...,"[(4493, 92.33333333333333), (21, 6.48648648648...","[0.0, 0.5, 0.0, 0.0, 0.0, 0.045454545454545456..."
5,5,Achilles,achilles nereid cymothoe attic redfigure kanth...,"[(6463, 131.0), (6464, 4.0), (6465, 1.0), (646...","[0.0, 0.0, 0.07142857142857142, 0.0, 0.0, 0.0,..."
6,6,Abraham Lincoln,abraham lincoln february 12 1809 april 15 1865...,"[(7825, 17.5), (7826, 152.66666666666666), (10...","[0.0, 2.142857142857143, 0.07142857142857142, ..."
7,7,Aristotle,aristotle aristotélēs 384322 bc ancient greek ...,"[(9394, 42.5), (9395, 1.0), (9396, 1.0), (188,...","[0.0, 0.35714285714285715, 1.9285714285714286,..."
8,8,An American in Paris,american paris jazzinfluenced orchestral piece...,"[(571, 0.6875), (459, 1.7142857142857142), (10...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0555555555555..."
9,9,Academy Award for Best Production Design,academy award best production design recognize...,"[(9405, 0.36363636363636365), (10541, 0.923076...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
data1.to_csv("preprocessed_data.csv", index=False)


EXTRA WORK / ERRORED CODE

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(data):
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the text data
    tfidf_matrix = vectorizer.fit_transform(data1['SECTION_TEXT'])

    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return tfidf_df

# Replace NaN values in 'SECTION_TEXT' with an empty string
data1['SECTION_TEXT'].fillna('', inplace=True)

# Compute TF-IDF
tfidf_df = compute_tfidf(data1)

# Display TF-IDF DataFrame
print(tfidf_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['SECTION_TEXT'].fillna('', inplace=True)


         00       000      0000    000000  000032184      0001       001  \
0   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
1   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
2   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
3   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
4   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
5   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
6   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
7   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
8   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
9   0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
10  0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
11  0.00000  0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
12  0.00000 

In [None]:
def compute_tfidf(data):
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the text data
    tfidf_matrix = vectorizer.fit_transform(data['SECTION_TEXT'])

    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return tfidf_df

def find_row(query):
    if query.isdigit():  # Check if the query is an ARTICLE_ID
        result = data1[data1['ARTICLE_ID'] == int(query)]
    # else:  # Assume the query is a TITLE
        result = data1[data1['TITLE'].str.contains(query, case=False)]
    
    if not result.empty:
        return result
    else:
        print("No matching ARTICLE_ID or TITLE found.")
        return None


# Fill missing ARTICLE_ID values with "text not available"
data1['ARTICLE_ID'].fillna("text not available", inplace=True)

# Compute TF-IDF
tfidf_df = compute_tfidf(data1)

# Find and display row based on user query
result_df = find_row(user_query)
if result_df is not None:
    print(result_df.to_string(index=False))

# Display TF-IDF DataFrame
tfidf_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['ARTICLE_ID'].fillna("text not available", inplace=True)


 ARTICLE_ID  TITLE                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

Unnamed: 0,00,000,0000,000000,000032184,0001,001,0010,0011,002,...,ἀχιλλεὺς,ἀχιλῆος,ἄειδε,ἄλγε,ἄχος,ἔθηκεν,ἰδείν,ὀξύς,ὠκὺς,𓆎𓅓𓏏𓊖
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003889,0.003889,0.003889,0.003889,0.003889,0.003889,0.0,0.003889,0.003889,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the DataFrame containing the articles data
data = pd.DataFrame({
    'ARTICLE_ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
                   20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 
                   38, 39, 40, 41, 42, 43],
    'TITLE': ['Anarchism', 'Autism', 'Albedo', 'A', 'Alabama', 'Achilles', 'Abraham Lincoln', 
              'Aristotle', 'An American in Paris', 'Academy Award for Best Production Design', 
              'Academy Awards', 'Actrius', 'Animalia (book)', 'International Atomic Time', 
              'Altruism', 'Ayn Rand', 'Alain Connes', 'Allan Dwan', 'Algeria', 
              'List of Atlas Shrugged characters', 'Anthropology', 'Agricultural science', 
              'Alchemy', 'Alien', 'Astronomer', 'ASCII', 'Austin (disambiguation)', 
              'Animation', 'Apollo', 'Andre Agassi', 'Austroasiatic languages', 
              'Afroasiatic languages', 'Andorra', 'Arithmetic mean', 'American Football Conference', 
              'Animal Farm', 'Amphibian', 'Alaska', 'Agriculture', 'Aldous Huxley', 'Ada', 
              'Aberdeen (disambiguation)', 'Algae', 'Analysis of variance'],
    'SECTION_TEXT': ['anarchism political philosophy advocate selfgo...', 
                     'autism neurodevelopmental disorder characteriz...', 
                     'percentage diffusely reflected sunlight relati...',
                     'writing cursive form named plural aes first le...',
                     'alabama state southeastern region united state...',
                     'achilles nereid cymothoe attic redfigure kanth...',
                     'abraham lincoln february 12 1809 april 15 1865...',
                     'aristotle aristotélēs 384322 bc ancient greek ...',
                     'american paris jazzinfluenced orchestral piece...',
                     'academy award best production design recognize...',
                     'academy award known officially oscar set twent...',
                     'actress catalan actrius 1997 catalan language ...',
                     'animalia illustrated childrens book graeme bas...',
                     'international atomic time tai french name high...',
                     'giving alms poor often considered altruistic a...',
                     'ayn rand born alisa zinovyevna rosenbaum march...',
                     'alain connes born 1 april 1947 french mathemat...',
                     'allan dwan 3 april 1885 28 december 1981 pione...',
                     'algeria officially people democratic republic ...',
                     'list character ayn rand novel atlas shrugged f...',
                     'anthropology study various aspect human within...',
                     'agricultural science broad multidisciplinary f...',
                     'kimiyayi saādat alchemy happiness text islamic...',
                     'alien primarily refers extraterrestrial life l...',
                     'astronomer johannes vermeer astronomer scienti...',
                     'ascii abbreviated american standard code infor...',
                     'austin capital texas united state austin may a...',
                     'animation process making illusion motion illus...',
                     'apollo attic ionic homeric greek apollōn doric...',
                     'andre kirk agassi born april 29 1970 american ...',
                     'austroasiatic language recent classification s...',
                     'afroasiatic afroasiatic also known afrasian tr...',
                     'andorra officially principality andorra also c...',
                     'mathematics statistic arithmetic mean stress t...',
                     'american football conference afc one two confe...',
                     'animal farm allegorical novella george orwell ...',
                     'amphibian ectothermic tetrapod vertebrate clas...',
                     'alaska u state located northwest extremity nor...',
                     'field záhorie slovakiaa typical central europe...',
                     'aldous leonard huxley 26 july 1894 22 november...',
                     'ada may refer africa ada foah ada ghana town a...',
                     'aberdeen city scotland united kingdom aberdeen...',
                     'algae singular alga informal term large divers...',
                     'analysis variance anova collection statistical...']
})

# Initialize the TF/IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform documents to TF/IDF vectors
tfidf_documents = tfidf_vectorizer.fit_transform(data['SECTION_TEXT'])

# Create a run dialog box to get user input for the query
query = input("Enter your query: ")

# Transform the query to a TF/IDF vector
tfidf_query = tfidf_vectorizer.transform([query])

# Calculate cosine similarity between query and documents
similarity_scores = cosine_similarity(tfidf_documents, tfidf_query)

# Rank documents based on similarity scores
ranked_documents = sorted(zip(similarity_scores, data['TITLE']), reverse=True)

# Display ranked documents
print("\nRanked Documents:")
for score, title in ranked_documents:
    print(f"Similarity Score: {score[0]} - Title: {title}")


In [None]:
from collections import defaultdict
import math

# Define the document collection
documents = [
    "I wonder how many miles I've fallen by this time?",
    "According to the latest census, the population of Moscow is more than two million.",
    "It was a warm, bright day at the end of August.",
    "To be, or not to be?",
    "autism"
]

# Define the query
query = "the population"

# Step 1: Define the vocabulary
vocabulary = set()
for document in documents:
    terms = document.lower().split()
    vocabulary.update(terms)

# Include query terms in the vocabulary
query_terms = query.lower().split()
vocabulary.update(query_terms)

# Step 2: Assign unique IDs to terms in the vocabulary
term_to_id = {term: i for i, term in enumerate(vocabulary)}

# Step 3: Calculate TF for each term in each document and the query
tf = defaultdict(lambda: defaultdict(int))
for doc_id, document in enumerate(documents):
    terms = document.lower().split()
    for term in terms:
        term_id = term_to_id[term]
        tf[doc_id][term_id] += 1

# Calculate TF for the query
query_tf = defaultdict(int)
for term in query_terms:
    term_id = term_to_id[term]
    query_tf[term_id] += 1

# Step 4: Calculate IDF for each term
num_documents = len(documents)
idf = defaultdict(float)
for term in vocabulary:
    term_id = term_to_id[term]
    doc_freq = sum(1 for doc_id in tf if term_id in tf[doc_id])
    idf[term_id] = math.log(num_documents / (1 + doc_freq))

# Step 5: Compute TF/IDF weights for each term in each document and the query
tfidf = defaultdict(lambda: defaultdict(float))
for doc_id in tf:
    for term_id in tf[doc_id]:
        tfidf[doc_id][term_id] = tf[doc_id][term_id] * idf[term_id]

# Compute TF/IDF weights for the query
query_tfidf = defaultdict(float)
for term_id in query_tf:
    query_tfidf[term_id] = query_tf[term_id] * idf[term_id]

# Step 6: Represent documents in a sparse format (optional)
sparse_tfidf = {}
for doc_id in tfidf:
    sparse_tfidf[doc_id] = [(term_id, weight) for term_id, weight in tfidf[doc_id].items()]

# Represent the query in a sparse format
sparse_query_tfidf = [(term_id, weight) for term_id, weight in query_tfidf.items()]

# Step 7: Calculate the relevance of the query to each document
relevance = defaultdict(float)
for doc_id in sparse_tfidf:
    relevance[doc_id] = sum(query_weight * doc_weight for term_id, query_weight in sparse_query_tfidf 
                            for term_id_doc, doc_weight in sparse_tfidf[doc_id] if term_id == term_id_doc)

# Step 8: Display the relevance of the query to each document
print("Relevance of the query to each document:")
for doc_id, score in relevance.items():
    print(f"Document {doc_id + 1}: {score:.4f}")


In [None]:
data1.to_csv("preprocessed_data.csv", index=False)
