In [74]:
import re
import pandas as pd
import os
from bs4 import BeautifulSoup

## **Email Validation**

This program extracts the valid email addresses from the text file given as input. The extracted email addresses are then exported to an Excel file named 'test.xlsx'. It also checks for an existing file of that name, if any, and replaces it.

In [73]:
def validate_email(text_file, output_file='test.xlsx'):
    """Extract the valid email addresses from a text file and export them to Excel.

    The file content is split on whitespace and on the separators ``, ! ? ;``.
    Trailing punctuation is removed from each token *before* it is checked, so
    the pattern is always applied to the address that would be exported.

    Args:
        text_file: Path of the text file to scan.
        output_file: Path of the Excel workbook to write. Defaults to
            'test.xlsx'.

    Returns:
        The string "Exported Successfully!" once the workbook has been written.
        A notice is printed first when ``output_file`` already exists.
    """
    with open(text_file, 'r') as file:
        content = file.read()

    # The top-level domain is letters only. Allowing '.' there (and stripping
    # punctuation only after matching) let tokens such as "john@example..."
    # pass and be exported as the invalid address "john@example".
    expression = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}'
    words = re.split(r'\s+|[,!?;]', content)
    candidates = (word.rstrip(".,!?;") for word in words)
    result = [candidate for candidate in candidates if re.fullmatch(expression, candidate)]

    df = pd.DataFrame(result, columns=['Valid Email IDs'])

    # to_excel overwrites an existing workbook, so no explicit delete is needed.
    if os.path.exists(output_file):
        print('Found an existing file! Updating it to a new one.')
    df.to_excel(output_file)

    return "Exported Successfully!"

# Run the extraction on 'test.txt' (expected in the working directory) and print the returned status message.
print(validate_email('test.txt'))

Found an existing file! Updating it to a new one.
Exported Successfully!


## **Parsing of HTML file**
The program parses an HTML file containing a product list, extracts the price of each product, and prints the prices to the console.
For parsing the HTML file, we use Beautiful Soup, a library commonly used for web scraping.

In [167]:
# Build the path with os.path.join so the cell also runs on macOS/Linux, where
# a backslash is an ordinary filename character rather than a path separator.
html_file = os.path.join('HTML-Parser', 'main.html')
with open(html_file, 'r') as file:
    content = file.read()

soup = BeautifulSoup(content, 'html.parser')

# Product names are read from every <h2>; the first 14 characters of the text
# are dropped -- presumably a fixed label prefix in main.html (TODO confirm).
prod_names = [name.getText()[14:] for name in soup.find_all('h2')]

# Prices are read from elements with class "price"; the first 10 characters are
# dropped -- presumably a label/currency prefix in main.html (TODO confirm).
prod_prices = [price.getText()[10:] for price in soup.find_all(class_="price")]

df = pd.DataFrame({'Product Name': prod_names, 'Product Price (In Rs.)': prod_prices})

df

Unnamed: 0,Product Name,Product Price (In Rs.)
0,Acer Ryzen 5,52999
1,Dell Inspiron 15,50999
2,Lenovo Yoga Slim 7i Gen 9,152999
3,Apple MacBook Air M1 chip,56999


## **Sum of Continuous Integers in a Mixed String**

The program takes a string containing digits and other characters and calculates the sum of all integers within the string. A run of consecutive digits is treated as a single number (for example, `a22c5d6` gives 22 + 5 + 6 = 33).

In [66]:
def string_sum(word):
    """Return the sum of all integers embedded in ``word``.

    A run of consecutive digits counts as one number, so "a22c5d6" gives
    22 + 5 + 6 = 33. A string without digits (including "") gives 0.

    Args:
        word: The string to scan.

    Returns:
        The integer sum of every digit run found in ``word``.
    """
    print(f'Input: {word}')
    # r'\d' matches only decimal digits (Unicode category Nd), which int() can
    # always parse. str.isdigit() also accepts characters such as '²' that make
    # int() raise ValueError.
    return sum(int(run) for run in re.findall(r'\d+', word))

# Prompt for a string and print the sum of the integers it contains.
# NOTE(review): input() blocks for keyboard entry, so this cell cannot run unattended.
print(string_sum(input("Enter the string: ")))

Input: a22c5d6
33


## **File Validation**
The program securely validates uploaded files by checking their file signatures (magic numbers). The program addresses:
- How to extract and validate a file's signature against a list of allowed file types (e.g., .jpg, .pdf).
- Steps to handle files with incorrect signatures, oversized files, or potential security risks.
- Any additional checks to ensure secure file uploads.

In [168]:
# Whitelist of accepted uploads. Each file type maps to:
#   "signature"  - magic-number prefixes as upper-case hex strings
#   "max_size"   - largest accepted file size, in bytes
#   "extensions" - file extensions accepted for that type

ALLOWED_FILE_TYPES = {
    "jpg": {
        "signature": ["FFD8FF"],
        "max_size": 5 * 1024 ** 2,  # 5 MB
        "extensions": [".jpg", ".jpeg"],
    },
    "png": {
        "signature": ["89504E47"],
        "max_size": 5 * 1024 ** 2,  # 5 MB
        "extensions": [".png"],
    },
    "pdf": {
        "signature": ["25504446"],
        "max_size": 10 * 1024 ** 2,  # 10 MB
        "extensions": [".pdf"],
    },
}

In [169]:
# Read the file signature: the sequence of bytes at the very beginning of a file.

def read_signature(file_path, num_bytes=8):
    """Return the leading bytes of a file as an upper-case hex string.

    Args:
        file_path: Path of the file to inspect.
        num_bytes: How many leading bytes to read. Defaults to 8.

    Returns:
        The bytes read, hex-encoded in upper case ("" for an empty file), or
        None when the file cannot be opened or read. The error is printed in
        that case. Returning None instead of an error message keeps the result
        falsy, so a caller's ``if not signature`` check detects the failure
        rather than treating the message text as a signature.
    """
    try:
        with open(file_path, 'rb') as file:
            return file.read(num_bytes).hex().upper()
    except OSError as e:
        print(f"Error while reading the signature {e}")
        return None
        

In [170]:
## Look up which allowed file type, if any, a signature belongs to.

def validate_signature(signature):
    """Return the first allowed file type whose magic number prefixes ``signature``.

    File types are tried in the order they appear in ALLOWED_FILE_TYPES, and a
    type matches when ``signature`` starts with any of its "signature" entries.

    Args:
        signature: Hex string to classify.

    Returns:
        The matching file-type key, or None when nothing matches.
    """
    matches = (
        kind
        for kind, spec in ALLOWED_FILE_TYPES.items()
        if any(signature.startswith(magic) for magic in spec["signature"])
    )
    return next(matches, None)

In [171]:
# Validate the extension against the ones allowed for the detected file type.
def validate_extention(file_path, valid_file_type):
    """Tell whether the extension of ``file_path`` is allowed for a file type.

    The comparison ignores case, so "photo.JPG" is accepted for the "jpg" type
    just like "photo.jpg".

    Args:
        file_path: File name or path whose extension is checked.
        valid_file_type: Key into ALLOWED_FILE_TYPES.

    Returns:
        True when the extension is listed under the type's "extensions",
        otherwise False.

    Raises:
        KeyError: If ``valid_file_type`` is not a key of ALLOWED_FILE_TYPES.
    """
    # The allowed extensions are stored in lower case, so normalise before comparing.
    extension = os.path.splitext(file_path)[1].lower()
    return extension in ALLOWED_FILE_TYPES[valid_file_type]["extensions"]

In [172]:
## Validate an uploaded file using the helper functions defined above.

def validate_uploaded_file(file_path):
    """Run every upload check on a file and report the outcome as a message.

    The checks run in this order, and the first failure is returned:
    the file exists, it is not empty, its signature can be read, the signature
    belongs to an allowed type, the size is within that type's limit, and the
    extension matches the detected type.

    Args:
        file_path: Path of the uploaded file.

    Returns:
        "File Uploaded successfully." when all checks pass, otherwise a string
        starting with "Error:" that describes the first failed check.
    """
    # Check that the path points to an existing regular file (not a directory)
    if not os.path.isfile(file_path):
        return "Error: No file exist."

    # Reject empty files first: an empty file has an empty signature, so doing
    # this after the signature check would make this message unreachable.
    size = os.path.getsize(file_path)
    if size == 0:
        return "Error: Empty File."

    # Read the signature
    file_sign = read_signature(file_path)
    if not file_sign:
        return "Error: Unable to read signature."

    # Validate the signature
    valid_file_type = validate_signature(file_sign)
    if not valid_file_type:
        return ("Error: Unsupported file type. Please upload the coorect file.")

    # Check the size limit for the detected file type
    max_size = ALLOWED_FILE_TYPES[valid_file_type]["max_size"]
    if size > max_size:
        return f"Error: Exceeded the allowed size for {valid_file_type} (max: {max_size/(1024*1024)} MB)."

    # Check that the file extension agrees with the detected file type
    file = os.path.basename(file_path)
    if not validate_extention(file, valid_file_type):
        return "Error: File extension does not match."

    return "File Uploaded successfully."
    
    

In [174]:
# Validate a sample PDF; os.path.join keeps the path working on every OS
# (a hard-coded backslash is a separator on Windows only).
print(validate_uploaded_file(os.path.join("File-Validator", "Aashish_Waghmare_Resume.pdf")))

File Uploaded successfully.


In [176]:
# Validate a sample JPEG; os.path.join keeps the path working on every OS
# (a hard-coded backslash is a separator on Windows only).
print(validate_uploaded_file(os.path.join('File-Validator', 'IMG_20220403_022831.jpg')))

Error: Exceeded the allowed size for jpg (max: 5.0 MB).
