In [1]:
import re
from contextlib import ExitStack

def parse_line(line):
    timestamp_pattern = r'^(\d{2}:\d{2}:\d{2})\s+(.*?):\s+(.*)$'
    match = re.match(timestamp_pattern, line)
    return match.groups() if match else None

def merge_sort(file_paths, output_file_path):
    combined_data = set()

    with ExitStack() as stack:
        try:
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                for file_path in file_paths:
                    try:
                        with stack.enter_context(open(file_path, 'r', encoding='utf-8')) as file:
                            content = file.readlines()
                            for line in content:
                                parsed_line = parse_line(line)
                                if parsed_line:
                                    combined_data.add(parsed_line)

                    except Exception as e:
                        print(f"Error processing file {file_path}: {e}")

                sorted_data = sorted(combined_data, key=lambda x: x[0])
                for entry in sorted_data:
                    output_file.write(f"{entry[0]} {entry[1]}: {entry[2]}\n")

        except Exception as e:
            print(f"Error writing to output file {output_file_path}: {e}")

    return combined_data

file_paths = [f"./recovery_{i}/GMT20240125-121850_RecordingnewChat - Copy ({i}).txt" for i in range(1, 16)]

output_file_path = "Framed_data.txt"

combined_data = merge_sort(file_paths, output_file_path)


In [2]:
import re

def extract_linkedin_urls(combined_data):
    all_urls = []
    validated_urls = []
    linkedin_pattern = re.compile(r'(https?://)?(www\.)?linkedin\.com/(in|feed)/([\w-]+)?')

    for entry in combined_data:
        matches = linkedin_pattern.findall(entry[2])
        for match in matches:
            if match[3] is not None:
                linkedin_url = f"{match[0] if match[0] else 'https://'}{match[1] if match[1] else 'www.'}linkedin.com/{match[2]}/{match[3]}"
                validated_urls.append(linkedin_url)
                raw_url = f"{match[0]}{match[1]}linkedin.com/{match[2]}{match[3]}"
                all_urls.append(raw_url)

    return all_urls, validated_urls

all_urls, validated_urls = extract_linkedin_urls(combined_data)
print("Validated LinkedIn URLs found: ", len(validated_urls)) 

print("All LinkedIn URLs:")
for raw_url in all_urls:
    print(raw_url)

print("Validated LinkedIn URLs:")
for validated_url in validated_urls:
    print(validated_url)


Validated LinkedIn URLs found:  468
All LinkedIn URLs:
https://www.linkedin.com/inyashkhandagale
https://www.linkedin.com/inkarri-dharma-teja-b7b575206
www.linkedin.com/insnehitha-tadapaneni-b8bb52201
https://www.linkedin.com/inaravinth-b
https://www.linkedin.com/inarun-kumar-vollala-78713518b
https://www.linkedin.com/inkartikdhanai
www.linkedin.com/inpranavbhawsar
https://www.linkedin.com/inraj-kumarsaggurthi
https://www.linkedin.com/innishantdandwate000
https://www.linkedin.com/inmohd-sardar-4a9867221
https://www.linkedin.com/inswastikdubey123
https://www.linkedin.com/inanushka-yeole
https://www.linkedin.com/inashutosh-ramnath-pawar
https://www.linkedin.com/inkunalkumavat
https://www.linkedin.com/indeepak-kumar-602352224
https://www.linkedin.com/ins-dinesh-kumar-b0284822a
https://www.linkedin.com/inbheesettianand
https://www.linkedin.com/injkpratik99
https://www.linkedin.com/invaishnavi-voleti-b55278238
https://www.linkedin.com/injkpratik99
https://www.linkedin.com/iniman-fasasi-7166

In [3]:
import re

def extract_linkedin_urls(combined_data):
    validated_urls = []
    all_urls = []
    linkedin_pattern = re.compile(r'(https?://)?(www\.)?linkedin\.com/in/([\w-]+)')

    for entry in combined_data:
        matches = linkedin_pattern.findall(entry[2])
        for match in matches:
            linkedin_url = f"{match[0] if match[0] else 'https://'}{match[1] if match[1] else 'www.'}linkedin.com/in/{match[2]}"
            validated_urls.append(linkedin_url)
            
            raw_url = f"{match[0]}{match[1]}linked.com/in/{match[2]}"
            all_urls.append(raw_url)

    return validated_urls, all_urls

validated_urls, all_urls = extract_linkedin_urls(combined_data)
print("Validated LinkedIn profiles according to the instruction : ", len(validated_urls))

print("\n All LinkedIn URLs: \n")
for url in all_urls:
    print(url)

unique_urls_without_https = list(set(all_urls))
unique_urls_with_https = list(set(validated_urls))
print("\n No of unique LinkedIn URLs :", len(unique_urls_without_https))
print("\n No of unique LinkedIn URLs :", len(unique_urls_with_https))


Validated LinkedIn profiles according to the instruction :  452

 All LinkedIn URLs: 

https://www.linked.com/in/yashkhandagale
https://www.linked.com/in/karri-dharma-teja-b7b575206
www.linked.com/in/snehitha-tadapaneni-b8bb52201
https://www.linked.com/in/aravinth-b
https://www.linked.com/in/arun-kumar-vollala-78713518b
https://www.linked.com/in/kartikdhanai
www.linked.com/in/pranavbhawsar
https://www.linked.com/in/raj-kumarsaggurthi
https://www.linked.com/in/nishantdandwate000
https://www.linked.com/in/mohd-sardar-4a9867221
https://www.linked.com/in/swastikdubey123
https://www.linked.com/in/anushka-yeole
https://www.linked.com/in/ashutosh-ramnath-pawar
https://www.linked.com/in/kunalkumavat
https://www.linked.com/in/deepak-kumar-602352224
https://www.linked.com/in/s-dinesh-kumar-b0284822a
https://www.linked.com/in/bheesettianand
https://www.linked.com/in/jkpratik99
https://www.linked.com/in/vaishnavi-voleti-b55278238
https://www.linked.com/in/jkpratik99
https://www.linked.com/in/iman-

In [4]:
import re


file_path = 'Framed_data.txt'
def extract_zoom_links(text):
    zoom_pattern = re.compile(r'https://us02web.zoom\.us/(j|w)/[0-9]+')
    lines = text.split('\n')
    
    matching_lines = [line for line in lines if zoom_pattern.search(line)]

    return matching_lines

def print_misconduct_occurrences(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    matching_lines = extract_zoom_links(file_content)
    
    count = 0
    if matching_lines:
        print("Misconduct occurrences:")
        for line in matching_lines:
            print(line)
            count+= 1
    print("Total number of misconduct : ",count)

print_misconduct_occurrences(file_path)

Misconduct occurrences:
01:33:12 RAJ KUMAR SAGGURTHI: https://us02web.zoom.us/w/82416269753?tk=XAfCCgVa2dUXqlUTrzuYY2Ehl4uhwPQMSYBvfjLoJGk.DQYAAAATMGR5uRZwY0tzeHhpa1NDMlRndEtLaElqTnB3AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&pwd=d09OQWRabGFrcTBDbE9mam02ZW4rQT09
Total number of misconduct :  1


In [5]:
import requests
from bs4 import BeautifulSoup

def validate_linkedin_profiles(linkedin_urls):

    validated_profiles = []

    for url in linkedin_urls:
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')

                if is_valid_linkedin_profile(soup):
                    validated_profiles.append(url)
        
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")

    return validated_profiles

def is_valid_linkedin_profile(soup):
    profile_name_element = soup.find('h1', {'class': 'text-heading-xlarge inline t-24 v-align-middle break-words'})
    return profile_name_element is not None

linkedin_profiles_to_check = ["https://www.linkedin.com/in/example1", "https://www.linkedin.com/in/example2"]
validated_profiles = validate_linkedin_profiles(linkedin_profiles_to_check)

print("Validated LinkedIn profiles")
for profile_url in validated_profiles:
    print(profile_url)
    print("hello")


Validated LinkedIn profiles
