In [None]:
import os

import pandas as pd
from neo4j import GraphDatabase

# Neo4j connection details.
# Each value can be overridden with an environment variable (NEO4J_URI,
# NEO4J_USER, NEO4J_PASSWORD) so the notebook can target another database
# without editing or committing credentials. The fallbacks keep the original
# local-development settings.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "company123"),  # placeholder default; set NEO4J_PASSWORD instead
)

In [None]:
# Verify connection
def verify_connection():
    """Check that the Neo4j server at ``URI`` can be reached with ``AUTH``.

    Opens a short-lived driver, asks it to verify connectivity, and prints
    the outcome.

    Returns:
        bool: ``True`` when the connectivity check completed without error,
        ``False`` when any exception was raised. The error is printed rather
        than re-raised so the notebook keeps running.
    """
    try:
        with GraphDatabase.driver(URI, auth=AUTH) as driver:
            driver.verify_connectivity()
            print("Connection established.")
        return True
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")
        return False

In [None]:
# Fetch data from Neo4j
def fetchData():
    """Load aggregated billing figures from Neo4j into a DataFrame.

    The Cypher query follows Employee -CHARGES-> Time -IS_BILLED_FOR-> Project,
    skips projects whose source is 'Unknown', and aggregates hours and rate
    per employee/project pair before computing ``total_cost``.

    Returns:
        pandas.DataFrame: one row per record returned by the query, with the
        columns named in its RETURN clause. If anything raises along the way,
        the error is printed and an empty DataFrame is returned instead.
    """
    query = """
    MATCH (e:Employee)-[:CHARGES]->(t:Time)-[:IS_BILLED_FOR]->(p:Project)
    WHERE p.source <> 'Unknown'
    WITH e, p, SUM(t.hours) AS total_time, AVG(e.rate) AS average_rate
    RETURN p.project_type AS project_type, p.source as source, e.employee_name as employee_name, total_time, average_rate, total_time * average_rate AS total_cost
    ORDER BY total_cost DESC
    """

    try:
        # Driver and session are both closed as soon as the rows are read.
        with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session() as session:
            rows = session.run(query).data()  # list of dicts, one per record
        return pd.DataFrame(rows)
    except Exception as err:
        print(f"Failed to fetch data: {err}")
        return pd.DataFrame()

In [None]:
# Check that Neo4j is reachable; the outcome is printed by verify_connection()
verify_connection()

# Load the query results into `data`; an empty frame means the query returned
# no rows or fetchData() hit an error (it prints its own failure message)
data = fetchData()
if data.empty:
    print("No data found. Check your Neo4j database for the required data.")


In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich
# table rendering instead of the plain-text repr that print() produces
data.head()

In [None]:
# Show the dtype pandas assigned to each column of the fetched data
print(data.dtypes)

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): the query returns total_cost as total_time * average_rate, and
# both of those columns remain in X below, so the target is a deterministic
# function of two features. Confirm this is intended.

# Expand each categorical column into indicator columns (first level dropped)
categorical_columns = ['project_type', 'source', 'employee_name']
encoded_df = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Target is 'total_cost'; every other column is used as a feature
y = encoded_df['total_cost']
X = encoded_df.drop(columns=['total_cost'])

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

In [None]:
# RandomForestRegressor was never imported in this notebook, so this cell
# raised NameError on a fresh kernel; import it where it is first used.
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Regressor
# 100 trees; random_state is fixed so repeated runs build the same forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # Using mean squared due to regression problem
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
# ':.2f' without the space sign-flag, so the value is not padded with an extra
# blank and lines up with the other two metrics
print(f"R-squared: {r2:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the results: actual vs predicted values for the test split.
# NOTE(review): the labels say "Revenue" while the model target is the
# 'total_cost' column. Confirm the intended wording.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed diagonal marks where a prediction equals the actual value
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=2)

ax.set_xlabel('Actual Revenue')
ax.set_ylabel('Predicted Revenue')
ax.set_title('Actual vs Predicted Revenue')
plt.show()