## Execises: Module 13 - Python Date Time and File Handling

### Python Datetime Execises

In [None]:
# 1. Get the current day, month, year, hour, minute, and timestamp from the datetime module.

from datetime import datetime

now = datetime.now()

current_day = now.day
current_month = now.month
current_year = now.year
current_hour = now.hour
current_minute = now.minute
current_timestamp = now.timestamp()

print("Current day: ", current_day)
print("Current month: ", current_month)
print("Current year: ", current_year)
print("Current hour: ", current_hour)
print("Current minute: ", current_minute)   
print("Current timestamp: ", current_timestamp)



In [None]:
# 2. Format the current date in the following formats: "%m/%d/%Y, %H:%M:%S"

now = datetime.now()
current_date = now.strftime("%m/%d/%Y, %H:%M:%S")
print("Current date: ", current_date)

In [None]:
# 3. Today is 5 December, 2019. Change this time string to time.

time_string = "5 December, 2019"
time_object = datetime.strptime(time_string, "%d %B, %Y")
print(time_object)

In [None]:
# 4. Calculate the time difference between now and new year.

now = datetime.now()
new_year = datetime.strptime("1 January, 2020", "%d %B, %Y")
time_difference = now - new_year
print(time_difference)

In [None]:
# 5. Calculate the time difference between 1 January 1970 and now.

now = datetime.now()
old_date = datetime.strptime("1 January, 1970", "%d %B, %Y")
time_difference = now - old_date
print(time_difference)

 6. Think, what can you use the datetime module for? Examples:
- Time series analysis
- To get a timestamp of any activities in an application
-  Adding post on a blog



**1. Time Series Analysis**
- Manage and analyze datasets indexed by time, such as stock market data or climate records.
- Perform operations like filtering data by date ranges or resampling for different time intervals.

**2. Timestamps for Activities**
- Log specific times for application events, like when a user logs in or performs certain actions.
- Track the exact time a file was created, modified, or accessed in a program.

**3. Adding Timestamps to Blog Posts**
- Automatically record the date and time when a blog post or comment is created.
- Display a "last updated" timestamp for edited posts.

**4. Scheduling Tasks**
- Automate reminders, notifications, or tasks based on specific times or intervals.
- Trigger functions or scripts to run at particular times (e.g., send an email daily at noon).

**5. Calculate Time Differences**
- Determine the duration between two events, like the time elapsed between a project start and end.
- Calculate a person's age by comparing their birth date with the current date.

**6. Formatting Dates**
- Convert dates into human-readable formats (e.g., "January 26, 2025").
- Standardize date formats for consistency across an application.


**7. Stopwatch or Timer**
- Measure how long a process or function takes to execute.
- Track performance metrics for operations in a program.

### File Handling Execises

### Exercises: Level 1

In [6]:
# Write a function which count number of lines and number of words in a text. All the files are in the data the folder: 
# a) Read obama_speech.txt file and count number of lines and words 
# b) Read michelle_obama_speech.txt file and count number of lines and words 
# c) Read donald_speech.txt file and count number of lines and words 
# d) Read melina_trump_speech.txt file and count number of lines and words

from pathlib import Path

def count_lines_and_words(file):
    path = Path(file)
    content = path.read_text()
    lines = content.splitlines()
    words = content.split()
    return len(lines), len(words)

obama_speech_stats = count_lines_and_words('obama_speech.txt')
print(
    f"The Obama speech has {obama_speech_stats[0]} "
    f"lines and {obama_speech_stats[1]} words."
)

michelle_speech_stats = count_lines_and_words('michelle_obama_speech.txt')
print(
    f"The Obama speech has {michelle_speech_stats[0]} "
    f"lines and {michelle_speech_stats[1]} words."
)

donald_speech_stats = count_lines_and_words('donald_speech.txt')
print(
    f"The Obama speech has {donald_speech_stats[0]} "
    f"lines and {donald_speech_stats[1]} words."
)

melina_speech_stats = count_lines_and_words('melina_trump_speech.txt')
print(
    f"The Obama speech has {melina_speech_stats[0]} "
    f"lines and {melina_speech_stats[1]} words."
)

The Obama speech has 66 lines and 2400 words.
The Obama speech has 35 lines and 1085 words.
The Obama speech has 48 lines and 1259 words.
The Obama speech has 33 lines and 1375 words.


In [10]:
# 2. Read the countries_data.json data file in data directory, create a function that finds the ten most spoken languages

import json
from pathlib import Path
from collections import Counter

def ten_most_spoken_languages(countries):
    """ Find ten most spoken languages in the world """
    language_counter = Counter()
    for country in countries:
        languages = country['languages']
        language_counter.update(languages)
    return language_counter.most_common(10)

path = Path('countries_data.json')
countries_data = json.loads(path.read_text(encoding='utf-8'))

print('Most Spoken Languages:')
most_spoken_languages = ten_most_spoken_languages(countries_data)
for key, value in most_spoken_languages:
    print(f"{key}: {value}")

Most Spoken Languages:
English: 91
French: 45
Arabic: 25
Spanish: 24
Portuguese: 9
Russian: 9
Dutch: 8
German: 7
Chinese: 5
Serbian: 4


In [13]:
# 3. Read the countries_data.json data file in data directory, create a function that finds the ten most populated countries

import json
from pathlib import Path
from collections import Counter

def ten_most_populated_countries(countries):
    """ Find ten most populated countries in the world """
    country_counter = Counter()
    for country in countries:
        country_name = country['name']
        population = country['population']
        country_counter[country_name] = population
    return country_counter.most_common(10)

path = Path('countries_data.json')
countries_data = json.loads(path.read_text(encoding='utf-8'))

print('Most Populated Countries:')
most_populated_countries = ten_most_populated_countries(countries_data)
for key, value in most_populated_countries:
    print(f" Country: {key}, Population: {value}")    

Most Populated Countries:
 Country: China, Population: 1377422166
 Country: India, Population: 1295210000
 Country: United States of America, Population: 323947000
 Country: Indonesia, Population: 258705000
 Country: Brazil, Population: 206135893
 Country: Pakistan, Population: 194125062
 Country: Nigeria, Population: 186988000
 Country: Bangladesh, Population: 161006790
 Country: Russian Federation, Population: 146599183
 Country: Japan, Population: 126960000


### Exercises: Level 2

In [20]:
# 4.Extract all incoming email addresses as a list from the email_exchange_big.txt file

import re
from pathlib import Path

path = Path('email_exchange_big.txt')
content = path.read_text()
lines = content.splitlines()
email_addresses = []
for line in lines:
    email_addresses.extend(re.findall(r'\b\w+@\w+\.\w+\b', line)) # use regular
    # expression to find email addresses
print(email_addresses)

['marquard@uct.ac', 'postmaster@collab.sakaiproject', 'm05ECIaH010327@nakamura.uits', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'marquard@uct.ac', 'source@collab.sakaiproject', 'marquard@uct.ac', 'marquard@uct.ac', 'louis@media.berkeley', 'postmaster@collab.sakaiproject', 'm04N8v6O008125@nakamura.uits', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'louis@media.berkeley', 'source@collab.sakaiproject', 'louis@media.berkeley', 'louis@media.berkeley', 'zqian@umich.edu', 'postmaster@collab.sakaiproject', 'm04L92hb007923@nakamura.uits', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'source@collab.sakaiproject', 'zqian@umich.edu', 'source@collab.sakaiproject', 'zqian@umich.edu', 'zqian@umich.edu', 'rjlowe@iupui.edu', 'postmaster@collab.sakaiproject', 'm04Kiem3007881@nakamura.uits', 'source@collab.sa

In [21]:
# 5. Find the most common words in the English language. Call the name of your function find_most_common_words, it will take two parameters 
# - a string or a file and a positive integer, indicating the number of words. Your function will return an array of tuples in descending order

import re

def find_most_common_words(filename,num_words):
    pattern = r'[\-–—]'
    words_dict = {}
    try:
        with open(filename) as file_obj:
            text = file_obj.read()
            text = re.sub(pattern, '',text)
            word_list = text.split()
            for word in word_list:
                if word in words_dict:
                    words_dict[word] += 1
                else:
                    words_dict[word] = 1
    except FileNotFoundError:
        print('Sorry file does not exist')
        return None
   
    sorted_dict_val_list = dict(sorted(words_dict.items(), key=lambda val: val[1], reverse=True))
    return list(sorted_dict_val_list.items())[:num_words]

x = find_most_common_words('obama_speech.txt',10)

print(x)

[('the', 120), ('and', 107), ('of', 81), ('to', 66), ('our', 58), ('we', 50), ('a', 48), ('that', 47), ('is', 36), ('in', 22)]


In [22]:
# 6. Use the function, find_most_frequent_words to find: 
# a) The ten most frequent words used in Obama's speech 
# b) The ten most frequent words used in Michelle's speech 
# c) The ten most frequent words used in Trump's speech 
# d) The ten most frequent words used in Melina's speech


obama = find_most_common_words('obama_speech.txt',10)
print(f'Obama = {obama}')
michelle = find_most_common_words('michelle_obama_speech.txt',10)
print(f'Michelle = {michelle}')
trump = find_most_common_words('donald_speech.txt',10)
print(f'Trump = {trump}')
melina = find_most_common_words('melina_trump_speech.txt',10)
print(f'Melina = {melina}')


Obama = [('the', 120), ('and', 107), ('of', 81), ('to', 66), ('our', 58), ('we', 50), ('a', 48), ('that', 47), ('is', 36), ('in', 22)]
Michelle = [('and', 47), ('to', 37), ('the', 34), ('a', 22), ('that', 20), ('my', 19), ('of', 18), ('our', 17), ('I', 16), ('in', 16)]
Trump = [('the', 61), ('and', 53), ('will', 40), ('of', 38), ('to', 32), ('our', 30), ('we', 26), ('is', 20), ('We', 15), ('America', 14)]
Melina = [('and', 73), ('to', 54), ('the', 48), ('I', 28), ('is', 28), ('for', 27), ('of', 25), ('a', 22), ('that', 19), ('Donald', 17)]


In [25]:
# 7. Write a python application that checks similarity between two texts. It takes a file or a string as a parameter and it will evaluate the similarity of the two texts. 
# For instance check the similarity between the transcripts of Michelle's and Melina's speech. You may need a couple of functions, function to clean the text(clean_text), 
# function to remove support words(remove_support_words) and finally to check the similarity(check_text_similarity). List of stop words are in the data directory


from stops_words import stop_words

def clean_text(text):
    pattern = r'[%$@&#;!\n–-]'
    return re.sub(pattern,'',text)

def remove_support_words(words_list):
    stop_words_list = stop_words
    return [word.lower() for word in words_list if word.lower() not in stop_words_list and word != '']
def similarity(filepath_1,filepath_2):
    try:
        with open(filepath_1) as f_obj:
            text_1 = f_obj.read()
        with open(filepath_2) as f_obj:
            text_2 = f_obj.read()
    except FileNotFoundError:
        print("File Path not found!")
        return None
    
    cleaned_text_1 = clean_text(text_1)
    cleaned_text_2 = clean_text(text_2)

    unique_no_support_1 = set(remove_support_words(cleaned_text_1.split()))

    unique_no_support_2 = set(remove_support_words(cleaned_text_2.split()))

    similar_words = [word for word in unique_no_support_1 if word in unique_no_support_2]

    all_word_list =  unique_no_support_1.union(unique_no_support_2)

    percent_score = (len(similar_words) / len(all_word_list)) * 100

    
    return round(percent_score,2)
    
    
trump_vs_obama = similarity('donald_speech.txt','obama_speech.txt')

print('Trump and Obama speech has similarity score of {}%'.format(trump_vs_obama))

   
obama_vs_obama = similarity('obama_speech.txt','obama_speech.txt')

print('Obama and Obama speech has similarity score of {}%'.format(obama_vs_obama))


Trump and Obama speech has similarity score of 9.15%
Obama and Obama speech has similarity score of 100.0%


In [27]:
# 8. Find the 10 most repeated words in the romeo_and_juliet.txt

find_most_common_words('romeo_and_juliet.txt',10)

[('of', 3),
 ('and', 3),
 ('THIS', 3),
 ('EBOOK', 3),
 ('OF', 3),
 ('at', 3),
 ('the', 3),
 ('Project', 2),
 ('Gutenberg', 2),
 ('Romeo', 2)]

In [28]:
# 9. Read the hacker news csv file and find out: 
# a) Count the number of lines containing python or Python 
# b) Count the number lines containing JavaScript, javascript or Javascript 
# c) Count the number lines containing Java and not JavaScript

import csv
from pathlib import Path

def count_lines_with_keywords(file_path, keywords):
    """Count the number of lines containing any of the specified keywords."""
    count = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            line = ' '.join(row)
            if any(keyword in line for keyword in keywords):
                count += 1
    return count

def count_lines_with_java_not_javascript(file_path):
    """Count the number of lines containing 'Java' but not 'JavaScript'."""
    count = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            line = ' '.join(row)
            if 'Java' in line and 'JavaScript' not in line and 'javascript' not in line and 'Javascript' not in line:
                count += 1
    return count

file_path = Path('hacker_news.csv')

# Count lines containing 'python' or 'Python'
python_count = count_lines_with_keywords(file_path, ['python', 'Python'])
print(f"Number of lines containing 'python' or 'Python': {python_count}")

# Count lines containing 'JavaScript', 'javascript' or 'Javascript'
javascript_count = count_lines_with_keywords(file_path, ['JavaScript', 'javascript', 'Javascript'])
print(f"Number of lines containing 'JavaScript', 'javascript' or 'Javascript': {javascript_count}")

# Count lines containing 'Java' but not 'JavaScript'
java_not_javascript_count = count_lines_with_java_not_javascript(file_path)
print(f"Number of lines containing 'Java' but not 'JavaScript': {java_not_javascript_count}")

Number of lines containing 'python' or 'Python': 179
Number of lines containing 'JavaScript', 'javascript' or 'Javascript': 184
Number of lines containing 'Java' but not 'JavaScript': 53
