In [1]:
import spacy
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load text file
file_path = "cleaned_20th_century_text.txt"

try:
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    print("File loaded successfully! Preview:\n")
    print(text[:500])  # Show first 500 characters as a preview
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")

File loaded successfully! Preview:


Contents
Current events
Random article
About Wikipedia
Contact us
Learn to edit
Community portal
Recent changes
Special pages
Donate
Create account
 Create account
 
Contributions
Top
Summary
Nature of innovation and change
Social change
Earth at the end of the 20th century
Wars and politics
Culture and entertainment
Toggle Culture and entertainment subsection
.
Music
Film, television and theatre
Video games
Art and architecture
Sport
Science
Toggle Science subsection
Mathematics
Physics
Astron


In [3]:
import re
special_chars = re.findall(r"[^a-zA-Z0-9.,;!?'\s]", text)
print(f"Special characters found: {set(special_chars)}")  # Unique special characters

Special characters found: {'Л', 'Ч', 'າ', 'ế', '-', 'Æ', 'ц', 'ള', 'Б', 'ç', 'ى', 'ø', 'Н', 'î', 'ե', 'ழ', 'ਜ', 'õ', 'ລ', 'Т', 'හ', 'ת', 'ማ', 'ə', 'ص', 'ج', 'म', '閩', 'မ', 'უ', 'з', '어', 'م', 'Հ', 'ह', 'á', '语', 'Р', 'ښ', 'ქ', 'О', 'Ε', '客', 'യ', 'ҷ', 'ó', 'ն', 'я', 'ம', 'த', 'Ӏ', 'у', 'ń', 'മ', 'ʻ', 'с', 'د', 'ल', 'х', 'သ', 'é', 'ب', 'ة', 'ր', 'త', 'ဘ', 'ї', 'र', 'ъ', '文', 'Г', 'ల', 'ў', 'Ž', 'ਬ', 'प', 'ۆ', 'М', 'ש', 'ن', 'ل', 'ي', 'ì', 'न', 'አ', 'గ', 'д', 'द', '中', 'յ', 'ল', 'س', '東', '南', 'ר', 'г', 'č', 'ӣ', 'è', 'ă', 'і', 'ד', 'à', 'ů', 'ά', '吴', 'ز', 'ш', 'و', 'ı', 'š', 'и', 'ê', 'л', 'ی', 'С', 'а', 'ლ', 'ر', 'У', 'â', 'ت', 'ι', '粵', '本', 'т', 'ठ', 'ه', 'ኛ', 'й', 'ñ', 'ย', 'ი', 'ท', 'რ', 'ā', 'ӑ', 'მ', 'ල', '贛', 'ҡ', 'К', 'Ś', '국', 'ա', 'ν', 'н', 'ĕ', 'ş', 'ė', 'န', 'ല', 'б', 'ë', 'ไ', 'о', 'Э', 'ü', 'پ', 'е', 'Í', 'И', 'ວ', 'ف', '語', 'å', 'р', 'ב', 'ა', '한', 'ע', 'ļ', '日', 'ч', 'к', 'ṳ', 'қ', 'ၽ', 'ú', 'λ', 'თ', 'Қ', 'ò', '家', 'κ', 'י', 'м', 'в', 'ا', 'ų', 'တ', 'ь', 'ਪ', 'ব', 'გ'

In [4]:
# Remove special characters (except common punctuation)
cleaned_text = re.sub(r"[^a-zA-Z0-9.,;!?'\s]", "", text)

In [5]:
# Normalize whitespace
cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

In [6]:
# Save cleaned text
with open("cleaned_text.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_text)

In [7]:
print("\nCleaned text preview:\n", cleaned_text[:500])


Cleaned text preview:
 Contents Current events Random article About Wikipedia Contact us Learn to edit Community portal Recent changes Special pages Donate Create account Create account Contributions Top Summary Nature of innovation and change Social change Earth at the end of the 20th century Wars and politics Culture and entertainment Toggle Culture and entertainment subsection . Music Film, television and theatre Video games Art and architecture Sport Science Toggle Science subsection Mathematics Physics Astronomy 


In [8]:
# Load spacy English model
nlp = spacy.load("en_core_web_sm")

In [9]:
# Create an NLP document
doc = nlp(cleaned_text)

In [10]:
# Display named entities 
for ent in doc.ents[:10]:  # Show first 10 entities
    print(f"{ent.text} - {ent.label_}")

Community - ORG
Earth - LOC
the end of the 20th century - DATE
Toggle Culture - ORG
Sport Science Toggle Science subsection Mathematics Physics Astronomy Agriculture Biology Medicine - ORG
Energy - ORG
Engineering - PERSON
Toggle Engineering - ORG
External - ORG
Afrikaans - NORP


In [11]:
# Extract all sentences from the document
sentences = list(doc.sents)

In [12]:
# Loop through all sentences and display their named entities
for sent in sentences:
    print(f"\nSentence: {sent.text}")  # Print the sentence
    for ent in sent.ents:
        print(f" - Entity: {ent.text} ({ent.label_})")  # Print each entity found


Sentence: Contents Current events Random article About Wikipedia Contact us Learn to edit Community portal Recent changes Special pages Donate Create account Create account Contributions
 - Entity: Community (ORG)

Sentence: Top Summary Nature of innovation and change Social change Earth at the end of the 20th century Wars and politics Culture and entertainment Toggle Culture and entertainment subsection .
 - Entity: Earth (LOC)
 - Entity: the end of the 20th century (DATE)
 - Entity: Toggle Culture (ORG)

Sentence: Music Film, television and theatre Video games Art and architecture Sport Science Toggle Science subsection Mathematics Physics Astronomy Agriculture Biology Medicine .. Notable diseases Energy and the environment Engineering and technology Toggle Engineering and technology subsection Space exploration Religion Economics See also References Sources Further reading External links Afrikaans Alemannisch nglisc Aragons Arpetan Asturianu Azrbaycanca Basa Bali Bnlmg Basa Banyuma

In [29]:
# Example country list
country_list = ["United States", "Germany", "France", "India", "China", "United Kingdom", "Canada","Russia"]

In [31]:
# Extract country names
country_entities = [ent.text for ent in doc.ents if ent.label_ == "GPE" and ent.text in country_list]

In [33]:
print("\nFiltered Country Entities:")
print(country_entities)


Filtered Country Entities:
['China', 'Germany', 'France', 'China', 'France', 'China', 'India', 'China', 'China', 'Canada', 'Russia', 'France']


In [35]:
# Create DataFrame
df = pd.DataFrame({"Country": country_entities})

In [39]:
# Display first few rows
df.head()

Unnamed: 0,Country
0,China
1,Germany
2,France
3,China
4,France


In [41]:
df.to_csv("country_entities.csv", index=False)

In [43]:
print("DataFrame saved as 'country_entities.csv'.")

DataFrame saved as 'country_entities.csv'.
