In [1]:
import pandas as pd  # Importing pandas for data manipulation and analysis
import nltk  # Importing the Natural Language Toolkit for text processing
from nltk.tokenize import (
    sent_tokenize,
)  # Importing functions for sentence and word tokenization

# Importing custom models for handling corpus, documents, and search functionality
from models import (
    Corpus,
    Document,
    SearchEngine,
)  # Importing Models

# Fetch the "punkt_tab" Punkt tokenizer data used by sent_tokenize further down.
# The call is cheap to repeat: it reports when the package is already up to date.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/alireza/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Location of the tab-separated file that holds the speeches
file_path = "./data/discours_US.csv"

# Let pandas open the file itself. The data is only read, so requesting write access
# (mode "r+") is unnecessary and makes the load fail on read-only files.
df = pd.read_csv(file_path, sep="\t")  # Columns are separated by tabs

In [3]:
# Preview the first rows to verify the data structure and contents. Leaving the
# expression last in the cell lets Jupyter render the frame as a table instead of the
# line-wrapped plain text that print() produces for wide frames.
df.head()

   speaker                                               text            date  \
0  CLINTON  : I'm getting ready for a lot of things, a lot...  April 12, 2015   
1  CLINTON  [ ] : I'll be graduating in May, and on gradua...  April 14, 2015   
2  CLINTON  : Well, thank you all so much for inviting me ...  April 20, 2015   
3  CLINTON  Thank you so much. I am absolutely delighted t...  April 29, 2015   
4  CLINTON  Oh, hello. Hi, how are you? Well, it's wonderf...     May 5, 2015   

                                               descr  \
0   Video Remarks Announcing Candidacy for President   
1  Remarks in a Question and Answer Session at Ki...   
2                    Remarks in Keene, New Hampshire   
3  Address to the David N. Dinkins Leadership & P...   
4  Remarks at a Roundtable with Young Nevada Resi...   

                                                link  
0  http://www.presidency.ucsb.edu/ws/index.php?pi...  
1  http://www.presidency.ucsb.edu/ws/index.php?pi...  
2  http://w

In [4]:
# Tally how many speeches each speaker contributed
author_counts = df["speaker"].value_counts()

# Show the per-speaker totals
print(author_counts)

speaker
CLINTON    93
TRUMP      71
Name: count, dtype: int64


In [5]:
# Corpus that will receive one document per sentence of every speech
corpus = Corpus.Corpus("us_speech")

# Walk the speeches in file order; each row carries the text and its metadata
for speech in df.itertuples(index=False):
    # Split the speech into sentences and register each one as its own document,
    # tagged with the metadata of the speech it comes from
    for sentence in sent_tokenize(speech.text):
        corpus.add(
            Document.Document(
                titre=speech.descr,  # Title of the speech
                texte=sentence,  # The sentence itself
                auteur=speech.speaker,  # Speaker who delivered the speech
                date=speech.date,  # Date of the speech
                url=speech.link,  # Link to the speech
            )
        )

# Report how many documents ended up in the corpus
print(f"Total documents in the corpus: {corpus.ndoc}")

Total documents in the corpus: 32460


In [11]:
# Pattern to look up in the corpus
keywords = r"freedom"

# Size of the context returned around each match
context_size = 15

# Run the concordance lookup on the corpus
concordance_results = corpus.concorde(keywords, context_size)

# Print a header followed by the matches and their surrounding context
print("Concordance Results for '{}':".format(keywords))
print(concordance_results)

Concordance Results for 'freedom':
      match                                context
0   freedom   the circle of freedom and opportunit
1   freedom  man rights and freedom is what's draw
2   freedom   crack down on freedom of expression,
3   freedom  Iraqis greater freedom of movement an
4   freedom  ho wanted more freedom, wanted to liv
5   freedom  ot a friend to freedom, it's not a fr
6   freedom  and worship in freedom, nations built
7   freedom  o struggle for freedom and self-deter
8   freedom   the religious freedom our country wa
9   freedom  .It represents freedom and hope and o
10  freedom  diminishes our freedom and independen
11  freedom  during values, freedom and equality, 
12  freedom  udents choice, freedom and opportunit
13  freedom  superiority of freedom over communism
14  freedom  you choice and freedom and control in
15  freedom  undermined our freedom and independen
16  freedom  f security and freedom, a country of 
17  freedom  enjoy the same freedom from fear t

In [7]:
# Build the search engine on top of the corpus created earlier. Construction logs its
# progress as the vocabulary, the term-frequency matrix and the TF-IDF matrix are built.
search_engine = SearchEngine.SearchEngine(corpus)

SearchEngine Vocabulary Built.
SearchEngine Term Frequency Matrix built.
SearchEngine TfIdf Matrix Built.


In [8]:
# Queries used to exercise the search engine: four single words and one two-word phrase
test_queries = ["freedom", "justice", "America", "speech", "civil rights"]


# Function to perform searches using the search engine and print the results
def test_search_engine(queries):
    # Iterate over each query in the provided list
    for query in queries:
        print(
            f"\nSearch Results for '{query}':"
        )  # Print the current query being searched

        # Perform the search using the search engine and limit results to the top 10
        results = search_engine.search(query, 10)

        # Check if any results were returned
        if not results.empty:
            print(results)
        else:
            print("No results found.")  # Inform the user if no results were found


# Run every test query through the search engine and print the ranked results of each
test_search_engine(test_queries)


Search Results for 'freedom':


Searching ...: 100%|███████████████████████████████████████████████████████████████████████████████████| 32460/32460 [00:04<00:00, 7298.68Docs/s]


   document_index     score  \
0            9400  0.484785   
1           24138  0.426167   
2           21595  0.418481   
3           17983  0.408249   
4           17978  0.357845   
5           30394  0.351469   
6           24324  0.347354   
7           18714  0.325552   
8           14055  0.308315   
9           14621  0.302155   

                                            document   author  
0      Remarks in San Diego, California, par CLINTON  CLINTON  
1      Debate between Trump and Clinton, par CLINTON  CLINTON  
2  Remarks at the Suburban Collection Showplace i...    TRUMP  
3  Remarks at a Rally at the James L. Knight Cent...    TRUMP  
4  Remarks at a Rally at the James L. Knight Cent...    TRUMP  
5  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
6      Debate between Trump and Clinton, par CLINTON  CLINTON  
7  Remarks at the Frontline Outreach Center in Or...  CLINTON  
8  Remarks at the Charlotte Convention Center in ...    TRUMP  
9  Remarks at the 

Searching ...: 100%|███████████████████████████████████████████████████████████████████████████████████| 32460/32460 [00:04<00:00, 6607.67Docs/s]


   document_index     score  \
0           30358  0.418416   
1             485  0.398241   
2           30159  0.391196   
3           18068  0.385236   
4             340  0.383655   
5           25019  0.381738   
6           30384  0.374718   
7             461  0.370518   
8           22677  0.360103   
9           22130  0.342036   

                                            document   author  
0  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
1  Address to the David N. Dinkins Leadership & P...  CLINTON  
2  Remarks at Pitt Community College in Wintervil...  CLINTON  
3  Remarks to the Black Women's Agenda Symposium ...  CLINTON  
4  Address to the David N. Dinkins Leadership & P...  CLINTON  
5  Remarks at the Renaissance Hotel in Columbus, ...    TRUMP  
6  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
7  Address to the David N. Dinkins Leadership & P...  CLINTON  
8  Remarks at Goodyear Hall and Theater in Akron,...  CLINTON  
9  Remarks at Litt

Searching ...: 100%|███████████████████████████████████████████████████████████████████████████████████| 32460/32460 [00:04<00:00, 6625.12Docs/s]


   document_index     score  \
0           29682  0.636875   
1           30743  0.594741   
2           26248  0.579996   
3           28144  0.515132   
4           19091  0.507694   
5           19223  0.507694   
6           14427  0.499647   
7           31536  0.498071   
8           12856  0.493288   
9           12859  0.493288   

                                            document   author  
0  Remarks at Smale Riverfront Park in Cincinnati...  CLINTON  
1  Remarks at Eastern Market in Detroit, Michigan...  CLINTON  
2      Debate between Trump and Clinton, par CLINTON  CLINTON  
3  Remarks at Broward College's North Campus in C...  CLINTON  
4  Remarks at a Rally at Sun Center Studios in Ch...    TRUMP  
5  Remarks at a Rally at Berglund Center in Roano...    TRUMP  
6  Remarks at the Summit Sports and Ice Complex i...    TRUMP  
7  Remarks at J.S Dorton Arena in Raleigh, North ...    TRUMP  
8  Remarks at a Rally at the University of North ...    TRUMP  
9  Remarks at a Ra

Searching ...: 100%|███████████████████████████████████████████████████████████████████████████████████| 32460/32460 [00:05<00:00, 6192.68Docs/s]


   document_index     score  \
0            4982  0.574151   
1           13463  0.492276   
2           12313  0.475807   
3           18335  0.423454   
4            2855  0.406205   
5           21023  0.375983   
6           10492  0.368782   
7           18623  0.357478   
8           31909  0.350240   
9            2124  0.346292   

                                            document   author  
0           Interview with Charlie Rose, par CLINTON  CLINTON  
1  Remarks at Youngstown State University in Youn...    TRUMP  
2  Remarks at the KI Convention Center in Green B...    TRUMP  
3  Remarks at Temple University in Philadelphia, ...  CLINTON  
4  Interview with Brianna Keilar of CNN's \State ...  CLINTON  
5  Remarks at the University of New Hampshire in ...  CLINTON  
6  Address Accepting the Presidential Nomination ...    TRUMP  
7  Remarks at the Frontline Outreach Center in Or...  CLINTON  
8  Remarks at North Carolina State University in ...  CLINTON  
9  Remarks and a Q

Searching ...: 100%|███████████████████████████████████████████████████████████████████████████████████| 32460/32460 [00:06<00:00, 4910.18Docs/s]

   document_index     score  \
0            6622  0.667022   
1           16182  0.641971   
2           30906  0.625811   
3           11749  0.512700   
4           18073  0.496987   
5           32058  0.491679   
6           28286  0.486742   
7           31763  0.439761   
8           19202  0.432862   
9           16635  0.404289   

                                            document   author  
0  Interview with Alisyn Camerota of CNN, par CLI...  CLINTON  
1  Remarks at the Cleveland Arts and Social Scien...    TRUMP  
2  Remarks at Eastern Market in Detroit, Michigan...  CLINTON  
3  Address Accepting the Presidential Nomination ...  CLINTON  
4  Remarks to the Black Women's Agenda Symposium ...  CLINTON  
5  Remarks at Grand Valley State University in Gr...  CLINTON  
6  Remarks at McGlohon Theatre at Spirit Square i...    TRUMP  
7  Remarks at the University of Pittsburgh, par C...  CLINTON  
8  Remarks at a Rally at Berglund Center in Roano...    TRUMP  
9  Remarks to the 




In [9]:
import ipywidgets as widgets  # Import the ipywidgets library for creating interactive widgets
from IPython.display import (
    display,
)  # Import display function to show widgets in Jupyter notebooks

# Heading shown at the top of the search interface
title_label = widgets.Label(value="Search Engine Interface")

# Text box receiving the keywords typed by the user
keywords_input = widgets.Text(
    description="Keywords:",
    placeholder="Enter keywords separated by commas",
)

# Slider choosing how many documents to return (from 1 to 100, starting at 1)
num_docs_slider = widgets.IntSlider(
    value=1,
    min=1,
    max=100,
    step=1,
    description="Num Docs:",
    continuous_update=False,
)

# Names of the authors registered in the corpus, used to fill the dropdown
authors = [known_author.name for _key, known_author in corpus.authors.items()]

# Dropdown restricting the results to one author; "All Authors" applies no filter
author_filter = widgets.Dropdown(
    options=["All Authors"] + authors,
    value="All Authors",
    description="Author:",
)

# Button that launches the search
search_button = widgets.Button(description="Search")

# Area in which the search results are rendered
output_area = widgets.Output()

# Stack all the controls vertically, in the order they should appear
ui_with_filters = widgets.VBox(
    children=[
        title_label,
        keywords_input,
        num_docs_slider,
        author_filter,
        search_button,
        output_area,
    ]
)

# Show the assembled interface in the notebook
display(ui_with_filters)


# Function to handle the search button click event
def on_search_button_click_with_filters(b):
    """Handle a click on the search button and render the hits in ``output_area``.

    Reads the query, the requested number of documents and the author filter from
    the widgets, runs the search and displays every hit whose score is positive.
    A short message is printed instead when the query is blank or nothing matches.

    Args:
        b: The button instance passed by ipywidgets; unused.
    """
    # Clear any previous output in the output area
    output_area.clear_output()

    # Retrieve values from the widgets
    keywords = keywords_input.value  # Keywords entered by the user
    num_docs = num_docs_slider.value  # Number of documents to return
    author = (
        author_filter.value if author_filter.value != "All Authors" else None
    )  # Selected author, or None when no author filter applies

    # Everything printed or displayed inside this block lands in the Output widget
    with output_area:
        if not keywords.strip():
            print("Please enter at least one keyword.")
            return

        # With an author selected, rank the whole corpus and truncate only after
        # filtering. Truncating first (the previous behaviour) could discard every
        # hit of that author and return fewer documents than requested.
        limit = num_docs if author is None else corpus.ndoc
        results = search_engine.search(keywords, limit)

        # Keep only the documents that actually match the query
        results = results[results["score"] > 0]
        if author is not None:
            results = results[results["author"] == author].head(num_docs)

        if results.empty:
            # Same message as test_search_engine so an empty area is never ambiguous
            print("No results found.")
            return

        # Display each result: a banner with its index and author, then its text
        for index, result in results.iterrows():
            display(f"************{index}.{result['author']}************")
            display(result["document"].texte)


# Register the handler so that every click on the search button runs a new search
search_button.on_click(on_search_button_click_with_filters)

VBox(children=(Label(value='Search Engine Interface'), Text(value='', description='Keywords:', placeholder='En…