In [1]:
# Import the libraries used throughout the notebook
import os
import json
import re
import xml.etree.ElementTree as ET
from docx import Document
import pandas as pd
import PyPDF2


In [None]:

# Function to extract mobile numbers from text using regular expression
def extract_mobile_numbers(text):
    """Find phone-number-like sequences in ``text`` and return them as digit strings.

    The pattern accepts an optional 1-3 digit country code followed by a
    3-3-4 digit number, with ``-``, ``.``, spaces, or parentheses allowed
    between the parts. Only the captured digit groups are kept, so the
    separators and any leading ``+`` do not appear in the results.

    Args:
        text: String to scan.

    Returns:
        A list with one string per match, in order of appearance; each string
        is the concatenation of that match's captured digit groups.
    """
    phone_regex = re.compile(
        r"\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b"
    )
    digit_strings = []
    for match in phone_regex.finditer(text):
        # A country-code group that did not participate contributes "".
        digit_strings.append("".join(match.groups(default="")))
    return digit_strings

# Function to extract mobile numbers from PDF
def extract_mobile_numbers_from_pdf(file_path):
    """Extract mobile numbers from the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The list produced by ``extract_mobile_numbers`` for the PDF's text.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # A page with no extractable text contributes an empty string.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join pages with a newline so the last token of one page and the first
    # token of the next are not glued together; fused text can remove the
    # word boundary the pattern relies on or merge unrelated digits into a
    # bogus number.
    text = "\n".join(page_texts)
    return extract_mobile_numbers(text)

# Function to extract mobile numbers from JSON
def extract_mobile_numbers_from_json(file_path):
    """Extract mobile numbers from a JSON file.

    The file is parsed and re-serialized to one string, so keys, string
    values and numeric values are all scanned.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The list produced by ``extract_mobile_numbers`` for the serialized data.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification. 'utf-8-sig' additionally tolerates a
    # leading BOM, which json.load would otherwise reject. Relying on the
    # platform default encoding breaks on non-ASCII content on Windows.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    # Keep non-ASCII characters literal. With the default ensure_ascii=True
    # they become \uXXXX escapes whose hex digits can fuse with an adjacent
    # number (e.g. a non-breaking space before "5551234567"), hiding it.
    text = json.dumps(data, ensure_ascii=False)
    return extract_mobile_numbers(text)

# Function to extract mobile numbers from XML
def extract_mobile_numbers_from_xml(file_path):
    """Extract mobile numbers from an XML file.

    The document is parsed and its root element is serialized back to a
    single string (markup included); that string is then scanned.

    Args:
        file_path: Path to the XML file.

    Returns:
        The list produced by ``extract_mobile_numbers`` for the serialized XML.
    """
    document_root = ET.parse(file_path).getroot()
    serialized = ET.tostring(document_root, encoding='unicode')
    return extract_mobile_numbers(serialized)

# Function to extract mobile numbers from MS Word
def extract_mobile_numbers_from_word(file_path):
    """Extract mobile numbers from the paragraphs of a Word (.docx) document.

    Paragraph texts are joined with single spaces before scanning.

    NOTE(review): only ``doc.paragraphs`` is read; text inside tables is
    presumably not included - confirm against the python-docx documentation.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The list produced by ``extract_mobile_numbers`` for the joined text.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return extract_mobile_numbers(' '.join(paragraph_texts))

# Function to save mobile numbers to an MS Excel file
def save_mobile_numbers_to_excel(mobile_numbers, output_file):
    """Write the numbers to an Excel file as a single 'Mobile Number' column.

    Args:
        mobile_numbers: Values to store, one per row.
        output_file: Destination passed to ``DataFrame.to_excel``.
    """
    column_data = {'Mobile Number': mobile_numbers}
    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(column_data).to_excel(output_file, index=False)


In [None]:

# Directory path containing the files
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
directory_path = r"C:\Users\ARJUN\Music\java\contact data"

# Supported file-name suffixes (lower-case) and the extractor for each
extractors_by_suffix = {
    '.pdf': extract_mobile_numbers_from_pdf,
    '.json': extract_mobile_numbers_from_json,
    '.xml': extract_mobile_numbers_from_xml,
    '.docx': extract_mobile_numbers_from_word,
}

# Extracted mobile numbers
mobile_numbers = []

# Iterate through files in the directory (including sub-directories)
for root, dirs, files in os.walk(directory_path):
    for file_name in files:
        file_path = os.path.join(root, file_name)

        # Word leaves "~$name.docx" lock files next to open documents; they
        # are not real documents and cannot be parsed.
        if file_name.startswith('~$'):
            continue

        # Match suffixes case-insensitively so e.g. "REPORT.PDF" is not skipped
        lowered_name = file_name.lower()
        for suffix, extractor in extractors_by_suffix.items():
            if lowered_name.endswith(suffix):
                mobile_numbers.extend(extractor(file_path))
                break

# Save mobile numbers to an MS Excel file
excel_file_path = 'mobile_numbers.xlsx'
save_mobile_numbers_to_excel(mobile_numbers, excel_file_path)