In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from models import Corpus, Document, SearchEngine  # Adjust based on your project structure

# Fetch the "punkt_tab" NLTK resource; per the recorded log this is a no-op
# when the package is already up to date.
# NOTE(review): presumably required by sent_tokenize, which is used later to
# split speeches into sentences - confirm for the installed NLTK version.
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/alireza/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
print(os.getcwd())  # the relative path below is resolved against this directory
file_path = './data/discours_US.csv'  # tab-separated despite the .csv extension

# Hand the path straight to pandas instead of an open(..., "r+") handle:
#  * "r+" requests write access, so a read-only dataset would fail to load
#    even though the notebook never writes to it;
#  * pandas decodes as UTF-8 by default rather than the platform locale
#    encoding, and closes the file itself.
df = pd.read_csv(file_path, sep="\t")

/home/alireza/Desktop/Fateme Project/pyproject


In [3]:
# Leave the frame as the cell's last expression so Jupyter uses its rich table
# rendering; print() falls back to the plain-text repr, which wraps columns
# across several blocks and truncates wide values.
df.head()

   speaker                                               text            date  \
0  CLINTON  : I'm getting ready for a lot of things, a lot...  April 12, 2015   
1  CLINTON  [ ] : I'll be graduating in May, and on gradua...  April 14, 2015   
2  CLINTON  : Well, thank you all so much for inviting me ...  April 20, 2015   
3  CLINTON  Thank you so much. I am absolutely delighted t...  April 29, 2015   
4  CLINTON  Oh, hello. Hi, how are you? Well, it's wonderf...     May 5, 2015   

                                               descr  \
0   Video Remarks Announcing Candidacy for President   
1  Remarks in a Question and Answer Session at Ki...   
2                    Remarks in Keene, New Hampshire   
3  Address to the David N. Dinkins Leadership & P...   
4  Remarks at a Roundtable with Young Nevada Resi...   

                                                link  
0  http://www.presidency.ucsb.edu/ws/index.php?pi...  
1  http://www.presidency.ucsb.edu/ws/index.php?pi...  
2  http://w

In [4]:
# Number of speeches per speaker (the 'speaker' column holds the author name)
author_counts = df.loc[:, "speaker"].value_counts()
print(author_counts)

speaker
CLINTON    93
TRUMP      71
Name: count, dtype: int64


In [5]:
corpus = Corpus.Corpus("us_speech")

# Each sentence of a speech becomes its own corpus document; every sentence
# carries the metadata (title, speaker, date, link) of the speech it came from.
for _, speech in df.iterrows():
    shared_fields = dict(
        titre=speech["descr"],
        auteur=speech["speaker"],
        date=speech["date"],
        url=speech["link"],
    )
    for sentence_text in sent_tokenize(speech["text"]):
        corpus.add(Document.Document(texte=sentence_text, **shared_fields))

# Report how many sentence-level documents ended up in the corpus
print(f"Total documents in the corpus: {corpus.ndoc}")

Total documents in the corpus: 32460


In [6]:
# Concordance demo: list occurrences of a pattern together with the text
# immediately around each match.
keywords = r"freedom"  # pattern to look up; replace with any keyword

# NOTE(review): the second argument of Corpus.concorde appears to be the width
# of the context window in characters, not a cap on the number of results -
# the recorded output lists more than 10 matches, each with exactly 10
# characters on either side. Confirm against Corpus.concorde.
context_size = 10
concordance_results = corpus.concorde(keywords, context_size)

# Display the concordance results
print("Concordance Results for '{}':".format(keywords))
print(concordance_results)


Concordance Results for 'freedom':
      match                      context
0   freedom  circle of freedom and oppor
1   freedom  ights and freedom is what's
2   freedom  k down on freedom of expres
3   freedom  s greater freedom of moveme
4   freedom  nted more freedom, wanted t
5   freedom  friend to freedom, it's not
6   freedom  orship in freedom, nations 
7   freedom  uggle for freedom and self-
8   freedom  religious freedom our count
9   freedom  epresents freedom and hope 
10  freedom  ishes our freedom and indep
11  freedom  g values, freedom and equal
12  freedom  s choice, freedom and oppor
13  freedom  iority of freedom over comm
14  freedom  hoice and freedom and contr
15  freedom  mined our freedom and indep
16  freedom  urity and freedom, a countr
17  freedom   the same freedom from fear
18  freedom   party of freedom, equality
19  freedom  efend our freedom, our jobs
20  freedom   have the freedom – the civ
21  freedom  rties and freedoms of all A
22  freedom  chool the

In [7]:
# Wrap the corpus in a search engine. Per the recorded output, construction
# itself builds the vocabulary, the term-frequency matrix and the TF-IDF matrix.
search_engine = SearchEngine.SearchEngine(corpus)

Building SearchEnginge Vocabulary: 100%|█████████████████████| 32460/32460 [00:00<00:00, 168386.15doc/s]


SearchEnginge Vocabulary Built.


Build SearchEnginge Term Frequency Matrix ...: 100%|█████████| 32460/32460 [00:00<00:00, 115518.80doc/s]


SearchEnginge Term Frequency Matrix built.


Building SearchEngine TfIdf Matrix ...: 100%|████████████████| 25875/25875 [00:00<00:00, 4112919.85it/s]

SearchEngine TfIdf Matrix Built.





In [8]:
# Sample queries used to exercise the search engine (single- and multi-word)
test_queries = ["freedom", "justice", "America", "speech", "civil rights"]

# Function to perform searches and print results
def test_search_engine(queries):
    for query in queries:
        print(f"\nSearch Results for '{query}':")
        results = search_engine.search(query, 10)  # Assuming your SearchEngine class has a search method
        if not results.empty:
            print(results)  # Print each result; adjust based on your Document representation
        else:
            print("No results found.")

# Push every sample query through the engine and print the ranked hits
test_search_engine(queries=test_queries)


Search Results for 'freedom':


Searching ...: 100%|██████████████████████████████████████████| 32460/32460 [00:04<00:00, 6743.49Docs/s]


   document_index     score  \
0            9400  0.484785   
1           24138  0.426167   
2           21595  0.418481   
3           17983  0.408249   
4           17978  0.357845   
5           30394  0.351469   
6           24324  0.347354   
7           18714  0.325552   
8           14055  0.308315   
9           14621  0.302155   

                                            document   author  
0      Remarks in San Diego, California, par CLINTON  CLINTON  
1      Debate between Trump and Clinton, par CLINTON  CLINTON  
2  Remarks at the Suburban Collection Showplace i...    TRUMP  
3  Remarks at a Rally at the James L. Knight Cent...    TRUMP  
4  Remarks at a Rally at the James L. Knight Cent...    TRUMP  
5  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
6      Debate between Trump and Clinton, par CLINTON  CLINTON  
7  Remarks at the Frontline Outreach Center in Or...  CLINTON  
8  Remarks at the Charlotte Convention Center in ...    TRUMP  
9  Remarks at the 

Searching ...: 100%|██████████████████████████████████████████| 32460/32460 [00:05<00:00, 6457.10Docs/s]


   document_index     score  \
0           30358  0.418416   
1             485  0.398241   
2           30159  0.391196   
3           18068  0.385236   
4             340  0.383655   
5           25019  0.381738   
6           30384  0.374718   
7             461  0.370518   
8           22677  0.360103   
9           22130  0.342036   

                                            document   author  
0  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
1  Address to the David N. Dinkins Leadership & P...  CLINTON  
2  Remarks at Pitt Community College in Wintervil...  CLINTON  
3  Remarks to the Black Women's Agenda Symposium ...  CLINTON  
4  Address to the David N. Dinkins Leadership & P...  CLINTON  
5  Remarks at the Renaissance Hotel in Columbus, ...    TRUMP  
6  Remarks at Coastal Credit Union Music Park in ...  CLINTON  
7  Address to the David N. Dinkins Leadership & P...  CLINTON  
8  Remarks at Goodyear Hall and Theater in Akron,...  CLINTON  
9  Remarks at Litt

Searching ...: 100%|██████████████████████████████████████████| 32460/32460 [00:04<00:00, 6979.62Docs/s]


   document_index     score  \
0           29682  0.636875   
1           30743  0.594741   
2           26248  0.579996   
3           28144  0.515132   
4           19091  0.507694   
5           19223  0.507694   
6           14427  0.499647   
7           31536  0.498071   
8           12856  0.493288   
9           12859  0.493288   

                                            document   author  
0  Remarks at Smale Riverfront Park in Cincinnati...  CLINTON  
1  Remarks at Eastern Market in Detroit, Michigan...  CLINTON  
2      Debate between Trump and Clinton, par CLINTON  CLINTON  
3  Remarks at Broward College's North Campus in C...  CLINTON  
4  Remarks at a Rally at Sun Center Studios in Ch...    TRUMP  
5  Remarks at a Rally at Berglund Center in Roano...    TRUMP  
6  Remarks at the Summit Sports and Ice Complex i...    TRUMP  
7  Remarks at J.S Dorton Arena in Raleigh, North ...    TRUMP  
8  Remarks at a Rally at the University of North ...    TRUMP  
9  Remarks at a Ra

Searching ...: 100%|██████████████████████████████████████████| 32460/32460 [00:04<00:00, 6961.74Docs/s]


   document_index     score  \
0            4982  0.574151   
1           13463  0.492276   
2           12313  0.475807   
3           18335  0.423454   
4            2855  0.406205   
5           21023  0.375983   
6           10492  0.368782   
7           18623  0.357478   
8           31909  0.350240   
9            2124  0.346292   

                                            document   author  
0           Interview with Charlie Rose, par CLINTON  CLINTON  
1  Remarks at Youngstown State University in Youn...    TRUMP  
2  Remarks at the KI Convention Center in Green B...    TRUMP  
3  Remarks at Temple University in Philadelphia, ...  CLINTON  
4  Interview with Brianna Keilar of CNN's \State ...  CLINTON  
5  Remarks at the University of New Hampshire in ...  CLINTON  
6  Address Accepting the Presidential Nomination ...    TRUMP  
7  Remarks at the Frontline Outreach Center in Or...  CLINTON  
8  Remarks at North Carolina State University in ...  CLINTON  
9  Remarks and a Q

Searching ...: 100%|██████████████████████████████████████████| 32460/32460 [00:04<00:00, 6881.26Docs/s]

   document_index     score  \
0            6622  0.667022   
1           16182  0.641971   
2           30906  0.625811   
3           11749  0.512700   
4           18073  0.496987   
5           32058  0.491679   
6           28286  0.486742   
7           31763  0.439761   
8           19202  0.432862   
9           16635  0.404289   

                                            document   author  
0  Interview with Alisyn Camerota of CNN, par CLI...  CLINTON  
1  Remarks at the Cleveland Arts and Social Scien...    TRUMP  
2  Remarks at Eastern Market in Detroit, Michigan...  CLINTON  
3  Address Accepting the Presidential Nomination ...  CLINTON  
4  Remarks to the Black Women's Agenda Symposium ...  CLINTON  
5  Remarks at Grand Valley State University in Gr...  CLINTON  
6  Remarks at McGlohon Theatre at Spirit Square i...    TRUMP  
7  Remarks at the University of Pittsburgh, par C...  CLINTON  
8  Remarks at a Rally at Berglund Center in Roano...    TRUMP  
9  Remarks to the 




In [21]:
import ipywidgets as widgets
from IPython.display import display

# Heading shown above the controls
title_label = widgets.Label(value="Search Engine Interface")

# Free-text box for the query
keywords_input = widgets.Text(
    placeholder='Enter keywords separated by commas',
    description='Keywords:',
)

# Slider for the number of documents to request (1 to 100, starting at 1)
num_docs_slider = widgets.IntSlider(
    description='Num Docs:',
    min=1,
    max=100,
    step=1,
    value=1,
    continuous_update=False,
)

# Names of all authors known to the corpus, used to populate the dropdown
authors = [author_entry.name for _key, author_entry in corpus.authors.items()]

# Author filter; the first entry means "do not filter"
author_filter = widgets.Dropdown(
    description='Author:',
    options=['All Authors'] + authors,
    value='All Authors',
)

# Button that triggers the search, and the area where results are rendered
search_button = widgets.Button(description="Search")
output_area = widgets.Output()

# Stack every control vertically and show the interface
ui_with_filters = widgets.VBox(
    [
        title_label,
        keywords_input,
        num_docs_slider,
        author_filter,
        search_button,
        output_area,
    ]
)
display(ui_with_filters)


def on_search_button_click_with_filters(b):
    """Run a search from the widget values and render the hits in ``output_area``.

    Reads the query from ``keywords_input``, the number of hits from
    ``num_docs_slider`` and the optional author from ``author_filter``. Rows
    with a score of 0 are dropped. When an author is selected, the filter is
    applied BEFORE truncating to the requested number of hits, so the user
    gets up to ``num_docs`` results for that author instead of whatever part
    of the global top ``num_docs`` happens to belong to them.

    Args:
        b: The button instance passed by ipywidgets on click (unused).
    """
    # Clear previous output
    output_area.clear_output()

    # Get the values from the widgets
    keywords = keywords_input.value.strip()
    num_docs = num_docs_slider.value
    author = None if author_filter.value == 'All Authors' else author_filter.value

    # Everything printed/displayed inside this context lands in the Output widget
    with output_area:
        if not keywords:
            # Nothing to search for; avoid sending an empty query to the engine.
            print("Please enter at least one keyword.")
            return

        # With an author filter, rank the whole corpus so that filtering does
        # not eat into the requested number of hits.
        # NOTE(review): assumes SearchEngine.search accepts a limit equal to
        # the corpus size (corpus.ndoc) - confirm.
        n_requested = num_docs if author is None else corpus.ndoc
        results = search_engine.search(keywords, n_requested)
        results = results[results["score"] > 0]
        if author is not None:
            results = results[results["author"] == author].head(num_docs)

        if results.empty:
            print("No results found.")
            return

        for index, result in results.iterrows():
            # print() rather than display(): display() on a str shows its
            # quoted repr (with escape sequences) instead of the text itself.
            print(f"************{index}.{result.author}************")
            print(result.document.texte)


# Bind the button click event to the handler
search_button.on_click(on_search_button_click_with_filters)

VBox(children=(Label(value='Search Engine Interface'), Text(value='', description='Keywords:', placeholder='En…