In [None]:
import pandas as pd

# Tow records; later cells read the 'Trade Name' column from this frame.
df=pd.read_csv('tows.csv')
# Top-10 lots table; later cells read its 'Merchant' column.
# NOTE(review): the '/content/...' prefix looks like a Colab working directory — confirm
# the path before running elsewhere.
ds=pd.read_csv('/content/top_10_lots-2.csv')
# Input of the final ranking cell, which expects 'Trade Name', 'distance1.4', 'cei',
# 'median_household_income', 'speak_only_english' and 'total_population' columns.
ranking_dataset=pd.read_csv('/content/rankingdataset_with_distance-2(1).csv')
# NOP table; the distance cell further down reads this same file again.
NOP = pd.read_csv('NOP_top10.csv')

In [None]:
# Step 1: Clean and normalize 'Trade Name' and 'Merchant' columns
def clean_name(name):
    """
    Lower-case a company name and strip common corporate suffixes.

    The substrings ", inc.", " inc.", ", llc" and " llc" are removed wherever
    they occur in the lower-cased name, then surrounding whitespace is trimmed.

    Parameters
    ----------
    name : str or missing value
        Raw company name. Anything `pd.isna` reports as missing is returned
        unchanged.

    Returns
    -------
    str or the original missing value
    """
    if pd.isna(name):
        return name

    cleaned = name.lower()
    # Order matters: the comma-prefixed variants must be removed before the bare
    # ones. Stripping " llc" first would turn "acme, llc" into "acme," and the
    # leftover comma would prevent the name from matching its cleaned counterpart.
    for suffix in (", inc.", " inc.", ", llc", " llc"):
        cleaned = cleaned.replace(suffix, "")
    return cleaned.strip()

# Steps 1-2: for each frame, derive the cleaned-name column and then collapse
# every name that contains the standalone word "ec" to the single label 'ec'
# (so "ec towing" and similar all compare equal).
for frame, raw_col, cleaned_col in (
    (df, 'Trade Name', 'Cleaned_Trade_Name'),
    (ds, 'Merchant', 'Cleaned_Merchant'),
):
    frame[cleaned_col] = frame[raw_col].apply(clean_name)
    has_ec_word = frame[cleaned_col].str.contains(r'\bec\b', na=False)
    frame.loc[has_ec_word, cleaned_col] = 'ec'

# Step 3: keep only the tow rows whose cleaned trade name appears among the
# cleaned merchant names.
top_merchants = ds['Cleaned_Merchant']
in_top_merchants = df['Cleaned_Trade_Name'].isin(top_merchants)
filtered_df = df[in_top_merchants]

# Step 4: distinct cleaned trade names that survived the filter.
unique_trade_names = filtered_df['Cleaned_Trade_Name'].unique()

# Step 5: report the size of the result and list the surviving names.
print(f"Number of entries in the final filtered DataFrame: {filtered_df.shape[0]}")
print(f"Number of unique trade names: {len(unique_trade_names)}")
print("Unique Trade Names:")
print(unique_trade_names)

# Write the filtered rows to disk without the index column.
filtered_df.to_csv('filtered_tows_with_top_10.csv', index=False)
print("Filtered data saved to 'filtered_tows_with_top_10.csv'.")

In [None]:
# Create another column equal to the Haversine distance between (lon, lat) and (lon_station, lat_station), multiplied by 1.4. All distances are in miles.

import pandas as pd
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in miles between two points.

    Arguments are passed to `radians`, i.e. they are treated as degrees, and
    are given longitude first: (lon1, lat1) is the first point and
    (lon2, lat2) the second. The Earth radius used is 3958.8 miles.
    """
    earth_radius_miles = 3958.8

    # Half of each angular difference, in radians.
    half_dlat = radians(lat2 - lat1) / 2
    half_dlon = radians(lon2 - lon1) / 2

    # Haversine of the central angle between the two points.
    lat_term = sin(half_dlat) ** 2
    lon_term = cos(radians(lat1)) * cos(radians(lat2)) * sin(half_dlon) ** 2
    hav = lat_term + lon_term

    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))
    return earth_radius_miles * central_angle



In [None]:
# Re-read the NOP table from disk (the same file the first cell loads).
NOP = pd.read_csv('NOP_top10.csv')


def _scaled_station_distance(record):
    """Return 1.4 times the haversine distance from a row's point to its station."""
    # NOTE(review): the station longitude is read from a column named
    # 'lot_station' while the latitude column is 'lat_station' — confirm this
    # key matches the CSV header and is not a typo for 'lon_station'.
    straight_line_miles = haversine_distance(
        record['lon'], record['lat'], record['lot_station'], record['lat_station']
    )
    return straight_line_miles * 1.4


# One value per row, stored under 'distance1.4'.
NOP['distance1.4'] = NOP.apply(_scaled_station_distance, axis=1)

# Write the augmented table to a new CSV file, without the index column.
NOP.to_csv('NOP_with_distance.csv', index=False)

In [None]:


# Helper function to normalize values
def normalize(column):
    """
    Min-max normalize a column to a scale of 0 to 1.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        `(column - min) / (max - min)`. When every value is identical (which
        includes a single-row column) the range is zero; each non-missing
        entry is then returned as 0.0 instead of the NaN that 0/0 would give.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: dividing by the zero range would turn every score
        # into NaN and break any later integer ranking of those scores.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Helper function to group and calculate normalized scores
def calculate_normalized_score(data, group_col, value_col, reference_value=None, deviation=False):
    """
    Average `value_col` per `group_col` and normalize the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows containing `group_col` and `value_col`.
    group_col : str
        Column to group by.
    value_col : str
        Column whose per-group mean is scored.
    reference_value : number, optional
        Reference point for the deviation mode.
    deviation : bool, default False
        When True and `reference_value` is given, the absolute distance of each
        group mean from `reference_value` is normalized instead of the mean
        itself.

    Returns
    -------
    pandas.DataFrame
        Two columns: `group_col` and f"{value_col}Score".

    Warns
    -----
    UserWarning
        If `deviation=True` is passed without a `reference_value`. The raw
        group means are still normalized in that case, but the request for a
        deviation cannot be honored and should not go unnoticed.
    """
    score_col = f"{value_col}Score"
    grouped = data.groupby(group_col)[value_col].mean().reset_index()

    if deviation and reference_value is None:
        import warnings

        warnings.warn(
            "deviation=True was requested without a reference_value; "
            f"normalizing the raw '{value_col}' group means instead.",
            UserWarning,
            stacklevel=2,
        )

    if deviation and reference_value is not None:
        deviation_col = f"{value_col} Deviation"
        grouped[deviation_col] = abs(grouped[value_col] - reference_value)
        grouped[score_col] = normalize(grouped[deviation_col])
    else:
        grouped[score_col] = normalize(grouped[value_col])
    return grouped[[group_col, score_col]]

# Load the dataset
# `data` is an alias of NOP (not a copy), so the ratio column added in Step 3
# also appears on NOP.
data = NOP

# NOTE(review): none of the scores in this cell are reversed, so a larger average
# distance / deviation produces a larger overallScore and therefore a
# lower-numbered rank. The later "General ranking" cell passes reverse=True for
# the same factors — confirm which direction is intended here.

# Step 1: Calculate and normalize `distance1.4` scores
average_distance = calculate_normalized_score(
    data, group_col="Cleaned_Trade_Name", value_col="distance1.4"
)

# Step 2: Calculate and normalize income deviation scores
montgomery_median_income = 125371
average_income = calculate_normalized_score(
    data,
    group_col="Cleaned_Trade_Name",
    value_col="median_household_income",
    reference_value=montgomery_median_income,
    deviation=True
)

# Step 3: Calculate and normalize English ratio deviation scores
data["Speak Only English Ratio"] = data["speak_only_english"] / data["total_population"]
montgomery_english_ratio = 0.566
average_english_ratio = calculate_normalized_score(
    data,
    group_col="Cleaned_Trade_Name",
    value_col="Speak Only English Ratio",
    reference_value=montgomery_english_ratio,
    deviation=True
)

# Step 4: Calculate and normalize CEI scores (absolute value of the per-group mean)
average_cei = data.groupby("Cleaned_Trade_Name")["cei"].mean().reset_index()
average_cei["Absolute CEI"] = average_cei["cei"].abs()
average_cei["ceiScore"] = normalize(average_cei["Absolute CEI"])

# Step 5: Merge all normalized scores into a single dataframe
final_ranking = average_distance.merge(
    average_income, on="Cleaned_Trade_Name"
).merge(
    average_english_ratio, on="Cleaned_Trade_Name"
).merge(
    average_cei[["Cleaned_Trade_Name", "ceiScore"]], on="Cleaned_Trade_Name"
)

# Step 6: Calculate Overall Weighted Score
# Define weights for each factor; the keys are the score column names.
weights = {
    "distance1.4Score": 0.25,
    "median_household_incomeScore": 0.25,
    "Speak Only English RatioScore": 0.25,
    "ceiScore": 0.25,
}

# Drive the sum from `weights` so a factor added to the dict cannot be forgotten here.
final_ranking["overallScore"] = sum(
    final_ranking[score_col] * weight for score_col, weight in weights.items()
)

# Step 7: Rank the Companies
# method="min" gives tied companies the same whole-number rank. The default
# ("average") yields fractional ranks such as 1.5 that astype(int) would
# silently truncate.
final_ranking["rank"] = (
    final_ranking["overallScore"].rank(method="min", ascending=False).astype(int)
)

# Step 8: Sort by Rank
final_ranking = final_ranking.sort_values(by="rank")

# Step 9: Save Final Ranked Data
output_path = "NOP_ranked_100.csv"
final_ranking.to_csv(output_path, index=False)

print(f"Final ranked data (including average CEI and rankings) has been saved to {output_path}")



In [None]:
import pandas as pd

# Helper function to normalize values
def normalize(column, scale=100, reverse=False):
    """
    Min-max normalize a column to a scale of 0 to `scale`, optionally reversed.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.
    scale : number, default 100
        Upper end of the output range.
    reverse : bool, default False
        When True the smallest input maps to `scale` and the largest to 0.

    Returns
    -------
    pandas.Series
        Rescaled values. When every value is identical (which includes a
        single-row column) the range is zero; each non-missing entry is then
        treated as sitting at the minimum (0, or `scale` when reversed) instead
        of becoming the NaN that 0/0 would give.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: dividing by the zero range would turn every score
        # into NaN and break the integer ranking computed from these scores.
        normalized = (column - min_val).astype(float)
    else:
        normalized = (column - min_val) / value_range
    if reverse:
        normalized = 1 - normalized
    return normalized * scale

# Helper function to group and calculate normalized scores
def calculate_normalized_score(data, group_col, value_col, reference_value=None, deviation=False, scale=100, reverse=False):
    """
    Average `value_col` per `group_col` and normalize the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows containing `group_col` and `value_col`.
    group_col : str
        Column to group by.
    value_col : str
        Column whose per-group mean is scored.
    reference_value : number, optional
        Reference point for the deviation mode.
    deviation : bool, default False
        When True and `reference_value` is given, the absolute distance of each
        group mean from `reference_value` is normalized instead of the mean
        itself.
    scale, reverse
        Forwarded unchanged to `normalize`.

    Returns
    -------
    pandas.DataFrame
        Two columns: `group_col` and f"{value_col}Score".

    Warns
    -----
    UserWarning
        If `deviation=True` is passed without a `reference_value`. The raw
        group means are still normalized in that case, but the request for a
        deviation cannot be honored and should not go unnoticed.
    """
    score_col = f"{value_col}Score"
    grouped = data.groupby(group_col)[value_col].mean().reset_index()

    if deviation and reference_value is None:
        import warnings

        warnings.warn(
            "deviation=True was requested without a reference_value; "
            f"normalizing the raw '{value_col}' group means instead.",
            UserWarning,
            stacklevel=2,
        )

    if deviation and reference_value is not None:
        deviation_col = f"{value_col} Deviation"
        grouped[deviation_col] = abs(grouped[value_col] - reference_value)
        basis = grouped[deviation_col]
    else:
        basis = grouped[value_col]

    grouped[score_col] = normalize(basis, scale=scale, reverse=reverse)
    return grouped[[group_col, score_col]]

# Clean the dataset by dropping rows with missing values in relevant columns.
# .copy() makes the result an independent frame, so the ratio column added below
# is written to it directly and does not trigger pandas' SettingWithCopyWarning.
columns_to_check = ["distance1.4", "cei", "median_household_income", "speak_only_english", "total_population"]
ranking_dataset_cleaned = ranking_dataset.dropna(subset=columns_to_check).copy()

# --- Distance Score Calculation ---
# reverse=True: the smallest average distance receives the highest score.
average_distance = calculate_normalized_score(
    ranking_dataset_cleaned, group_col="Trade Name", value_col="distance1.4", scale=100, reverse=True
)

# --- CEI Score Calculation ---
# deviation=True only takes effect when a reference value is supplied; without
# one the raw mean CEI was being scored. Using 0 as the reference scores the
# absolute mean CEI, which is what the NOP ranking cell computes.
# NOTE(review): confirm 0 is the intended CEI reference point.
average_cei = calculate_normalized_score(
    ranking_dataset_cleaned,
    group_col="Trade Name",
    value_col="cei",
    reference_value=0,
    scale=100,
    deviation=True,
    reverse=True
)

# --- English Score Calculation ---
ranking_dataset_cleaned["Speak Only English Ratio"] = (
    ranking_dataset_cleaned["speak_only_english"] / ranking_dataset_cleaned["total_population"]
)
average_english_ratio = calculate_normalized_score(
    ranking_dataset_cleaned,
    group_col="Trade Name",
    value_col="Speak Only English Ratio",
    reference_value=0.566,  # Montgomery County's percentage
    scale=100,
    deviation=True,
    reverse=True
)

# --- Income Score Calculation ---
average_income = calculate_normalized_score(
    ranking_dataset_cleaned,
    group_col="Trade Name",
    value_col="median_household_income",
    reference_value=125371,  # Montgomery County's median income
    scale=100,
    deviation=True,
    reverse=True
)

# --- Combine All Scores ---
final_ranking = average_distance.merge(
    average_cei, on="Trade Name"
).merge(
    average_english_ratio, on="Trade Name"
).merge(
    average_income, on="Trade Name"
)

# --- Calculate Overall Weighted Score ---
# Keys are the score column names produced above.
weights = {
    "distance1.4Score": 0.25,
    "ceiScore": 0.25,
    "Speak Only English RatioScore": 0.25,
    "median_household_incomeScore": 0.25,
}

# Drive the sum from `weights` so a factor added to the dict cannot be forgotten here.
final_ranking["overallScore"] = sum(
    final_ranking[score_col] * weight for score_col, weight in weights.items()
)

# --- Rank the Companies ---
# method="min" gives tied companies the same whole-number rank. The default
# ("average") yields fractional ranks such as 1.5 that astype(int) would
# silently truncate.
final_ranking["rank"] = (
    final_ranking["overallScore"].rank(method="min", ascending=False).astype(int)
)

# --- Sort by Rank ---
final_ranking = final_ranking.sort_values(by="rank")

# --- Save Final Ranked Data ---
output_grouped_path = "General_ranking_100.csv"
final_ranking.to_csv(output_grouped_path, index=False)

print(f"Final ranked data has been saved to {output_grouped_path}")
