# Creating Dataset Using Faker

In [1]:
import random
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta

# Seed both random sources so the sampled pools below (and the later draws
# made from them) are repeatable across kernel restarts.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker
fake = Faker()

# Define possible transport types and the pools the generator samples from
transport_types = ["Offer", "Request", "Unknown"]
cities = [fake.city() for _ in range(10)]
price_range = [round(random.uniform(50, 500), 2) for _ in range(20)]
contact_numbers = [fake.phone_number() for _ in range(30)]

# Function to generate random transport messages
def generate_random_message():
    """Build one synthetic carpool message together with its labelled fields.

    The transport type is drawn from ``transport_types``. For "Offer" and
    "Request" the location and contact fields are filled in; for "Unknown"
    they are ``None``. ``price`` is only set for "Offer".

    Returns:
        dict: keys ``text``, ``time``, ``user_id``, ``user_name``,
        ``departure_location``, ``departure_date``, ``departure_time``,
        ``arrival_location``, ``price``, ``contact_number`` and
        ``transport_type``.
    """
    transport_type = random.choice(transport_types)
    is_known = transport_type != "Unknown"

    # Draw two distinct entries of the pool so a trip does not start and end
    # at the same list position.
    origin, destination = random.sample(cities, 2)
    departure_location = origin if is_known else None
    arrival_location = destination if is_known else None
    # A date between 1 and 365 days before today, formatted day/month/year.
    departure_date = (datetime.now() - timedelta(days=random.randint(1, 365))).strftime('%d/%m/%Y')
    departure_time = fake.time()
    price = random.choice(price_range) if transport_type == "Offer" else None
    contact_number = random.choice(contact_numbers) if is_known else None
    user_name = fake.name()
    user_id = fake.uuid4()

    # Generate message text
    if transport_type == "Offer":
        message = f"Lift offered from {departure_location} to {arrival_location} on {departure_date} at {departure_time}. Contact {contact_number}."
    elif transport_type == "Request":
        message = f"Lift requested from {departure_location} to {arrival_location} on {departure_date}. Please call {contact_number}."
    else:
        # The labelled locations are None for this type, so the text uses the
        # drawn cities directly instead of printing "between None and None".
        message = f"Doing trips between {origin} and {destination}. Join our WhatsApp group for more details."

    return {
        "text": message,
        "time": datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z'),
        "user_id": user_id,
        "user_name": user_name,
        "departure_location": departure_location,
        "departure_date": departure_date,
        "departure_time": departure_time,
        "arrival_location": arrival_location,
        "price": price,
        "contact_number": contact_number,
        "transport_type": transport_type
    }

# Draw 500 independent synthetic messages from the generator
generated_messages = list(generate_random_message() for _ in range(500))

# Tabulate the records: one row per message, one column per field
df = pd.DataFrame(generated_messages)

# Write the table to an Excel workbook, leaving out the row index
output_file = "generated_transport_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Data exported to {output_file}")


Data exported to generated_transport_data.xlsx


# Import Necessary Libraries

In [None]:
import pandas as pd
import re
import os
from dotenv import load_dotenv
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, EntitiesOptions, CategoriesOptions
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
load_dotenv()

# Initialize IBM Watson NLU Service

In [None]:
def initialize_ibm_watson_nlu(api_key, service_url):
    """Create a Watson NLU client pointed at the given service instance.

    Args:
        api_key: API key handed to ``IAMAuthenticator``.
        service_url: Endpoint URL set on the client.

    Returns:
        NaturalLanguageUnderstandingV1: client created with API version
        ``2023-09-26`` and the service URL applied.
    """
    client = NaturalLanguageUnderstandingV1(
        version='2023-09-26',
        authenticator=IAMAuthenticator(api_key),
    )
    client.set_service_url(service_url)
    return client

# Clean text messages (remove emojis, special characters, etc.)

In [None]:
def clean_message(message):
    """Reduce a message to ASCII letters, digits and whitespace.

    Args:
        message: Raw message text. ``None`` and float NaN (what a missing
            spreadsheet cell typically becomes in pandas) are treated as an
            empty message; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        str: The text with every non-ASCII character (emojis, accented
        letters, ...) and every ASCII punctuation or symbol character removed.
        Whitespace is left untouched. Separators such as ``/``, ``:``, ``+``
        and ``-`` are removed too, so ``12/03/2024`` becomes ``12032024``.
    """
    # Missing value: NaN is the only float that is not equal to itself.
    if message is None or (isinstance(message, float) and message != message):
        return ''
    if not isinstance(message, str):
        message = str(message)
    # Remove emojis and other non-ASCII characters
    message = re.sub(r'[^\x00-\x7F]+', '', message)
    # Remove everything that is not a letter, a digit or whitespace
    message = re.sub(r'[^a-zA-Z0-9\s]', '', message)
    return message

# Send message to IBM NLU for information extraction

In [None]:
def analyze_message(nlu, message):
    """Run entity and category extraction on one message.

    Args:
        nlu: Client object exposing ``analyze(text=..., features=...)``.
        message: Text to analyse.

    Returns:
        The value of ``get_result()`` on the analysis response, or ``None``
        when any exception is raised (the error is printed, not re-raised).
    """
    try:
        # Up to 10 entities without emotion/sentiment scoring, plus the top category.
        requested_features = Features(
            entities=EntitiesOptions(emotion=False, sentiment=False, limit=10),
            categories=CategoriesOptions(limit=1),
        )
        detailed_response = nlu.analyze(text=message, features=requested_features)
        result = detailed_response.get_result()
    except Exception as e:
        print(f"Error processing message: {message}, Error: {str(e)}")
        return None
    return result

# Extract phone number, date, and time from the message

In [None]:
def extract_information(nlu_response):
    """Pull the phone number, date and time entities out of an NLU result.

    Args:
        nlu_response: Mapping with an optional ``'entities'`` list; each
            entity is a mapping with a ``'type'`` key and, for the types of
            interest, a ``'text'`` key.

    Returns:
        tuple: ``(phone_number, date, time)``. A slot is ``None`` when no
        entity of that type is present; when several are present the last
        one in the list wins.
    """
    # One slot per entity type of interest; later entities overwrite earlier ones.
    found = {'PhoneNumber': None, 'Date': None, 'Time': None}
    for item in nlu_response.get('entities', []):
        kind = item['type']
        if kind in found:
            found[kind] = item['text']
    return found['PhoneNumber'], found['Date'], found['Time']

# Categorize message as 'request' or 'offer'

In [None]:
def categorize_message(message):
    """Label a message as ``'request'``, ``'offer'`` or ``'unknown'``.

    Matching is a case-insensitive substring search. Request keywords are
    checked before offer keywords, so a text containing both is a request.
    """
    lowered = message.lower()
    # Ordered (label, keywords) pairs; the first group with a hit decides.
    keyword_groups = (
        ('request', ('request', 'need a ride')),
        ('offer', ('offer', 'giving a ride')),
    )
    for label, keywords in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'unknown'

# Match similar requests and offers (simple matching based on location)

In [None]:
def match_requests_and_offers(df):
    """Pair every request with every offer that covers the same route.

    Args:
        df: DataFrame with the columns ``'Category'``, ``'Message'``,
            ``'Phone Number'``, ``'Departure Location'`` and
            ``'Arrival Location'``. ``None`` or an empty frame is accepted.

    Returns:
        list[dict]: One dict per (request, offer) pair with the keys
        ``'Request'``, ``'Offer'``, ``'Request Phone'``, ``'Offer Phone'``,
        ``'Departure'`` and ``'Arrival'``. Pairs are ordered by request row,
        then by offer row. Rows whose departure or arrival location is
        missing are never paired.
    """
    # An empty frame (e.g. every message failed analysis) may not even have
    # the expected columns, so bail out before indexing into it.
    if df is None or df.empty:
        return []

    def _route(record):
        """Return the (departure, arrival) key, or None if either is missing."""
        departure = record['Departure Location']
        arrival = record['Arrival Location']
        # Without this check two rows with unknown locations would compare
        # equal (None == None) and be reported as a match.
        if pd.isna(departure) or pd.isna(arrival):
            return None
        return departure, arrival

    # Index offers by route once instead of rescanning them for each request.
    offers_by_route = {}
    for offer in df[df['Category'] == 'offer'].to_dict('records'):
        route = _route(offer)
        if route is not None:
            offers_by_route.setdefault(route, []).append(offer)

    matched_rides = []
    for req in df[df['Category'] == 'request'].to_dict('records'):
        route = _route(req)
        if route is None:
            continue
        for offer in offers_by_route.get(route, []):
            matched_rides.append({
                'Request': req['Message'],
                'Offer': offer['Message'],
                'Request Phone': req['Phone Number'],
                'Offer Phone': offer['Phone Number'],
                'Departure': req['Departure Location'],
                'Arrival': req['Arrival Location']
            })
    return matched_rides

# Main function to process messages

In [None]:
# Column layout of the per-message result table; passing it explicitly keeps
# the columns present even when no message could be processed.
_RESULT_COLUMNS = [
    'Message',
    'Phone Number',
    'Date',
    'Time',
    'Category',
    'Departure Location',
    'Arrival Location',
]


def _extract_location(message, keyword):
    """Return the capitalised place name following ``keyword``, or None."""
    # A place word starts with an uppercase letter and is not itself one of
    # the connecting words (relevant for all-caps text like "FROM X TO Y").
    word = r"(?!(?i:from|to|on|at)\b)[A-Z][A-Za-z]*"
    # \b stops "to" from matching inside words such as "into"; the keyword is
    # matched case-insensitively, the place name is not.
    pattern = rf"\b(?i:{re.escape(keyword)})\s+({word}(?:\s+{word})*)"
    found = re.search(pattern, message)
    return found.group(1) if found else None


def process_carpool_messages(file_path, api_key, service_url):
    """Analyse every message in an Excel sheet and pair requests with offers.

    Args:
        file_path: Path of an Excel file with a ``'text'`` column.
        api_key: API key forwarded to ``initialize_ibm_watson_nlu``.
        service_url: Service URL forwarded to ``initialize_ibm_watson_nlu``.

    Returns:
        tuple: ``(result_df, matched_rides)``. ``result_df`` has one row per
        successfully analysed message with the columns ``'Message'``,
        ``'Phone Number'``, ``'Date'``, ``'Time'``, ``'Category'``,
        ``'Departure Location'`` and ``'Arrival Location'``; the two location
        columns hold the place name only (e.g. ``'Port Mary'``), without the
        leading "from"/"to". ``matched_rides`` is whatever
        ``match_requests_and_offers`` returns for that frame.
    """
    # Initialize NLU
    nlu = initialize_ibm_watson_nlu(api_key, service_url)

    # Load messages from Excel
    df = pd.read_excel(file_path)

    # Clean messages and extract information
    results = []
    for raw_text in df['text']:
        # Missing cells (NaN/None) carry no message to analyse.
        if pd.isna(raw_text):
            continue
        message = clean_message(str(raw_text))
        # Nothing left after cleaning (e.g. an emoji-only message).
        if not message.strip():
            continue

        nlu_response = analyze_message(nlu, message)

        # Skip if NLU response is None
        if nlu_response is None:
            continue

        phone_number, date, time = extract_information(nlu_response)

        results.append({
            'Message': message,
            'Phone Number': phone_number,
            'Date': date,
            'Time': time,
            'Category': categorize_message(message),
            # Heuristic: the capitalised words right after "from" / "to".
            'Departure Location': _extract_location(message, 'from'),
            'Arrival Location': _extract_location(message, 'to')
        })

    # Convert results into DataFrame
    result_df = pd.DataFrame(results, columns=_RESULT_COLUMNS)

    # Match requests and offers
    matched_rides = match_requests_and_offers(result_df)

    return result_df, matched_rides

# Running the code

In [12]:
if __name__ == '__main__':
    # The key is read from the environment rather than stored in the notebook.
    API_KEY = os.getenv("IBM_API_KEY")
    if not API_KEY:
        # Fail here with an actionable message instead of deep inside the SDK.
        raise RuntimeError("IBM_API_KEY is not set; define it in the environment or in a .env file.")
    # The endpoint can be overridden with IBM_SERVICE_URL; the default is the
    # instance this notebook was originally run against.
    SERVICE_URL = os.getenv(
        "IBM_SERVICE_URL",
        'https://api.eu-de.natural-language-understanding.watson.cloud.ibm.com/instances/a23cfb67-2e4e-40d4-87ac-9a74b7e94f96'
    )
    FILE_PATH = 'Tests Extract data 250924.xlsx'  # Path to your Excel file containing messages

    processed_data, matched_rides = process_carpool_messages(FILE_PATH, API_KEY, SERVICE_URL)

    # Print matched rides
    if matched_rides:
        for ride in matched_rides:
            print(f"Request: {ride['Request']}, Offer: {ride['Offer']}, Departure: {ride['Departure']}, Arrival: {ride['Arrival']}")
    else:
        print("No matched rides found.")


Error processing message: MOLWENi MANDIQALE NGE FOWNI KWEZELA NINGALAHLEKE CALLWTSPP 0665757443 IGAMA LAM NDINGU AYANDA MASILE NDIHLALA DUBARN EKHAYA KUSE EASTERN CAPE EMTHATHA TRANSKEI NDIFUNA KUNIXELELA NINA NONKE KUBA NDIHLUPHEKE IXESHA ELIDE  NGIKHANGELA U NCEDO LWEMALI KODWA BENDILAHLE NJE IMALI ZAMI EMVA KOKUBONA IPOST YOMNYE USISI NDILEKA APA KU FACEBOOK UMNCOMA UBABA UKHUMALO NAM NDATI MANDIMZAME  NDATHETHA NAYE UBABA UKHUMALO NYAN UBABA UKhUMALO WACELA ACCOUNT YAM NDAMNIKA NYAN UTHI NDILINDE 32 MINUTES NDABONA MESSAGE INGENA KWI ACCOUNT YAM KWANGENA R 38 MILLION YHO KHUBULELWA KWAM NDAMBHATALA 600 THAWUSANDE SO XA NIFUNA NANILUNCEDO PLZ  UBABA Khumalo CALL WHTSPP 0665757443  KWABANYE FOWNELA UBABA   KhumAL0 CALL OR WHATSAPP 0665757443 Makhosi, Error: Error: unsupported text language: xh, Status code: 400 , X-global-transaction-id: b776f6ab-ff88-452b-a799-793ca0092d92
Error processing message: 0826275920Molweni igama lam ndingu Foad King
 ndise Kapa kodwa ekhaya kuse Mthatha  n