##### Copyright 2019 The TensorFlow Authors.


In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load CSV data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/load_data/csv"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

This tutorial provides examples of how to use CSV data with TensorFlow.

There are two main parts to this:

1. **Loading the data off disk**
2. **Pre-processing it into a form suitable for training.**

This tutorial focuses on the loading, and gives some quick examples of preprocessing. To learn more about the preprocessing aspect, check out the [Working with preprocessing layers](https://www.tensorflow.org/guide/keras/preprocessing_layers) guide and the [Classify structured data using Keras preprocessing layers](../structured_data/preprocessing_layers.ipynb) tutorial.


## Setup

In [2]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

## In memory data

For any small CSV dataset the simplest way to train a TensorFlow model on it is to load it into memory as a [pandas `DataFrame`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or a NumPy array.


A relatively simple example is the [abalone dataset](https://archive.ics.uci.edu/ml/datasets/abalone).

* The dataset is small.
* All the input features are limited-range floating point values.

Here is how to download the data into a `DataFrame`:

In [3]:
import pandas as pd

abalone_train = pd.read_csv(
    "analytics.csv",
    names=["id", "page_name", "page_title", "visit_date", "visit_count",
           "country", "countryCode", "region", "regionName", "city", "zip",
           "lat", "lon", "timezone", "isp", "org", "as"]
)

abalone_train.head()

Unnamed: 0,id,page_name,page_title,visit_date,visit_count,country,countryCode,region,regionName,city,zip,lat,lon,timezone,isp,org,as
0,id,page_name,page_title,visit_date,visit_count,country,countryCode,region,regionName,city,zip,lat,lon,timezone,isp,org,as
1,9,https://enally.in/contact,Enally - Contact Us,2024-05-09,2,India,IN,HP,Himachal Pradesh,Solan,173229,30.908500,77.102200,Asia/Kolkata,Bharti Airtel Limited,Bharti Airtel Limited,AS45609 Bharti Airtel Ltd. AS for GPRS Service
2,10,https://enally.in/,Enally - Entrepreneur's Abode of Alliance!,2024-05-09,7,Canada,CA,ON,Ontario,Guelph,N1E,43.569800,-80.242100,America/Toronto,Rogers Communications Canada Inc.,Rogers Cable Inc. YM,AS812 Rogers Communications Canada Inc.
3,11,https://enally.in/,Enally - Entrepreneur's Abode of Alliance!,2024-05-09,3,India,IN,HP,Himachal Pradesh,Solan,173229,30.908500,77.102200,Asia/Kolkata,Bharti Airtel Limited,Bharti Airtel Limited,AS45609 Bharti Airtel Ltd. AS for GPRS Service
4,12,https://enally.in/projects-list,Enally - Projects List,2024-05-09,1,India,IN,HP,Himachal Pradesh,Solan,173229,30.908500,77.102200,Asia/Kolkata,Bharti Airtel Limited,Bharti Airtel Limited,AS45609 Bharti Airtel Ltd. AS for GPRS Service


In [7]:
# prompt: Using dataframe abalone_train: page_name vs visit_count vs visit_count

import altair as alt
alt.data_transformers.enable("vegafusion")
# Create the Altair chart
# Sample the data
sampled_data = abalone_train.sample(n=5000, random_state=42)

# Or aggregate the data
aggregated_data = abalone_train.groupby('page_name')['visit_count'].sum().reset_index()

# Then use the sampled or aggregated data in your chart
chart = alt.Chart(sampled_data).mark_circle().encode(
    x='page_name',
    y='visit_count',
    tooltip=['page_name', 'visit_count']
).interactive()


The dataset contains a set of measurements of [abalone](https://en.wikipedia.org/wiki/Abalone), a type of sea snail.

![an abalone shell](https://tensorflow.org/images/abalone_shell.jpg)

 [“Abalone shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan Pogue](https://www.flickr.com/photos/thenickster/), CC BY-SA 2.0)


In [15]:
!pip install "vl-convert-python>=1.6.0"

Collecting vl-convert-python>=1.6.0
  Downloading vl_convert_python-1.7.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading vl_convert_python-1.7.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.1/30.1 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vl-convert-python
Successfully installed vl-convert-python-1.7.0


In [13]:
conda install -c conda-forge "vegafusion-python-embed>=1.5.0" "vegafusion>=1.5.0"

ValueError: The python kernel does not appear to be a conda environment.  Please use ``%pip install`` instead.

In [16]:
# prompt: Generate code based on above code and data

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import altair as alt

# Load the dataset
abalone_train = pd.read_csv(
    "analytics.csv",
    names=["id", "page_name", "page_title", "visit_date", "visit_count",
           "country", "countryCode", "region", "regionName", "city", "zip",
           "lat", "lon", "timezone", "isp", "org", "as"]
)

# Display the first few rows
print(abalone_train.head())

# Enable VegaFusion data transformer for Altair
alt.data_transformers.enable("vegafusion")

# Sample the data for visualization (optional, but recommended for large datasets)
sampled_data = abalone_train.sample(n=min(5000, len(abalone_train)), random_state=42)

# Create an interactive Altair chart
chart = alt.Chart(sampled_data).mark_circle().encode(
    x='visit_count',  # Use visit_count on x-axis
    y='page_name',  # Use page_name on y-axis
    color='region',  # Color points by region, if available in the data
    tooltip=['page_name', 'visit_count', 'region']  # Show relevant information in tooltip
).interactive()

# Display the chart
chart.display()


# Example of data aggregation and visualization
aggregated_data = abalone_train.groupby('page_name')['visit_count'].sum().reset_index()

bar_chart = alt.Chart(aggregated_data).mark_bar().encode(
    x='page_name',
    y='visit_count'
).properties(
    width=600,
    height=400,
    title="Total Visit Counts per Page"
).interactive()

bar_chart.display()


   id                        page_name  \
0  id                        page_name   
1   9        https://enally.in/contact   
2  10               https://enally.in/   
3  11               https://enally.in/   
4  12  https://enally.in/projects-list   

                                   page_title  visit_date  visit_count  \
0                                  page_title  visit_date  visit_count   
1                         Enally - Contact Us  2024-05-09            2   
2  Enally - Entrepreneur's Abode of Alliance!  2024-05-09            7   
3  Enally - Entrepreneur's Abode of Alliance!  2024-05-09            3   
4                      Enally - Projects List  2024-05-09            1   

   country  countryCode  region        regionName    city     zip        lat  \
0  country  countryCode  region        regionName    city     zip        lat   
1    India           IN      HP  Himachal Pradesh   Solan  173229  30.908500   
2   Canada           CA      ON           Ontario  Guelph     

In [17]:
# prompt: Generate proper graph for pages vs country and region vs counts

# Assuming 'abalone_train' DataFrame is already loaded as in the provided code.

import altair as alt

# Enable VegaFusion data transformer for Altair
alt.data_transformers.enable("vegafusion")

# Sample the data for visualization (optional, but recommended for large datasets)
sampled_data = abalone_train.sample(n=min(5000, len(abalone_train)), random_state=42)

# --- Pages vs Country ---
# Create a chart showing the relationship between pages and countries
page_country_chart = alt.Chart(sampled_data).mark_bar().encode(
    x='country:N',  # Use country as a nominal variable on x-axis
    y='count()',  # Count the number of occurrences for each country
    color='page_name:N',  # Color the bars by page_name
    tooltip=['country', 'page_name', 'count()']  # Show relevant information in tooltip
).properties(
    width=600,
    height=400,
    title="Page Visits by Country"
).interactive()


# --- Region vs Visit Counts ---
region_counts_chart = alt.Chart(sampled_data).mark_bar().encode(
    x='regionName:N',
    y='sum(visit_count):Q',  # Sum of visit_counts for each region
    color='region:N',  # Color the bars by region
    tooltip=['regionName', 'sum(visit_count)', 'region']
).properties(
    width=600,
    height=400,
    title="Total Visit Counts by Region"
).interactive()


# Display the charts
page_country_chart.display()
region_counts_chart.display()

In [20]:
# prompt: Help me creating a trained model using these data what can help me taking decision which page and what users are liking

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.metrics import classification_report

# Load the dataset
abalone_train = pd.read_csv(
    "analytics.csv",
    names=["id", "page_name", "page_title", "visit_date", "visit_count",
           "country", "countryCode", "region", "regionName", "city", "zip",
           "lat", "lon", "timezone", "isp", "org", "as"]
)

# Data preprocessing
# 1. Feature selection: Select relevant features
features = ['page_name', 'visit_count', 'country', 'region', 'city']
X = abalone_train[features]

# 2. Handle categorical features (one-hot encoding or label encoding)
label_encoders = {}
for col in ['page_name', 'country', 'region', 'city']:
    le = LabelEncoder()
    # Fit on all unique values, including those in new_data_point
    le.fit(pd.concat([X[col], pd.Series(['US', 'CA', 'San Francisco'])], ignore_index=True))
    X[col] = le.transform(X[col])
    label_encoders[col] = le  # Store the encoders for later use

# 3. Handle missing values (if any)
X.fillna(0, inplace=True)

# 4. Target variable:  Let's predict 'page_name' for example
y = X['page_name']
X = X.drop('page_name', axis=1)

# Convert 'visit_count' to numeric, handling errors
X['visit_count'] = pd.to_numeric(X['visit_count'], errors='coerce')
# Fill NaN values created during conversion with 0
X['visit_count'].fillna(0, inplace=True)

# 5. Feature scaling (important for some models)
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training (example: RandomForestClassifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


# Example prediction:
# Prepare a new data point for prediction
new_data_point = pd.DataFrame({
    'visit_count': [5],  # Replace with actual values
    'country': ['US'],  # Replace with actual values
    'region': ['CA'],  # Replace with actual values
    'city': ['San Francisco']  # Replace with actual values
})

# Apply the same preprocessing as before
for col in ['country', 'region', 'city']:
  new_data_point[col] = label_encoders[col].transform(new_data_point[col])

new_data_point = scaler.transform(new_data_point)


predicted_page = model.predict(new_data_point)
predicted_page_name = label_encoders['page_name'].inverse_transform(predicted_page)
print(f"Predicted page name: {predicted_page_name}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

              precision    recall  f1-score   support

           3       0.43      0.81      0.56       552
           4       0.50      0.14      0.22         7
           5       0.00      0.00      0.00        17
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         0
          19       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          28       0.00      0.00      0.00         0
          31       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         0
          39       0.00      0.00      0.00         0
          40       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# prompt: Based on above progress continue the model training and proceed with the model traning

# Assuming 'abalone_train', 'X', 'y', 'X_train', 'X_test', 'y_train', 'y_test', 'model', 'scaler', and 'label_encoders' are already defined as in the previous code.

# Further model training (example: increase the number of estimators)
# You can experiment with different hyperparameters or use a different model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) # Example: Increased n_estimators
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Example prediction with new data:
new_data_point = pd.DataFrame({
    'visit_count': [10],
    'country': ['CA'],
    'region': ['CA'],
    'city': ['Toronto']
})

for col in ['country', 'region', 'city']:
    new_data_point[col] = label_encoders[col].transform(new_data_point[col])

new_data_point = scaler.transform(new_data_point)
predicted_page = model.predict(new_data_point)
predicted_page_name = label_encoders['page_name'].inverse_transform(predicted_page)
print(f"Predicted page name: {predicted_page_name}")


#Further training iterations with different hyperparameters
model2 = RandomForestClassifier(n_estimators = 300, random_state = 42)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print(classification_report(y_test, y_pred))


#Tensorflow model
# Define the model
model_tf = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    # Ensure the output layer has enough neurons to cover all possible labels
    layers.Dense(len(label_encoders['page_name'].classes_), activation='softmax')
])

# Compile the model
model_tf.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model_tf.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

loss, accuracy = model_tf.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           3       0.42      0.93      0.58       552
           4       0.50      0.14      0.22         7
           5       0.00      0.00      0.00        17
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         0
          31       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         0
          41       0.00      0.00      0.00         1
          44       0.00      0.00      0.00         1
          45       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.1786 - loss: 4.5946 - val_accuracy: 0.3294 - val_loss: 2.7816
Epoch 2/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3686 - loss: 2.5851 - val_accuracy: 0.3294 - val_loss: 2.7259
Epoch 3/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3863 - loss: 2.5086 - val_accuracy: 0.3336 - val_loss: 2.6983
Epoch 4/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3725 - loss: 2.4832 - val_accuracy: 0.3345 - val_loss: 2.6985
Epoch 5/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3850 - loss: 2.4516 - val_accuracy: 0.3311 - val_loss: 2.7010
Epoch 6/10
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3806 - loss: 2.4506 - val_accuracy: 0.3328 - val_loss: 2.6834
Epoch 7/10
[1m148/148[0m [32m━━━━━━

In [23]:
# prompt: Improve the above test and create something when Asked which place, ip, location provided is more used or etc it should predict or give the answer

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# ... (Your existing code for data loading and preprocessing) ...

# Enhanced prediction function
def predict_page(model, scaler, label_encoders, visit_count, country, region, city):
    new_data_point = pd.DataFrame({
        'visit_count': [visit_count],
        'country': [country],
        'region': [region],
        'city': [city]
    })

    for col in ['country', 'region', 'city']:
        try:
            new_data_point[col] = label_encoders[col].transform(new_data_point[col])
        except ValueError as e:  # Handle unseen labels gracefully
            print(f"Warning: {e}. Using the most frequent label for {col}.")
            # Handle unseen label by replacing it with the most frequent one
            new_data_point[col] = label_encoders[col].transform([label_encoders[col].classes_[0]])

    new_data_point = scaler.transform(new_data_point)
    predicted_page = model.predict(new_data_point)
    predicted_page_name = label_encoders['page_name'].inverse_transform(predicted_page)
    return predicted_page_name[0]

# Example usage of the enhanced prediction function:
predicted_page = predict_page(model, scaler, label_encoders, 10, 'US', 'CA', 'San Francisco')
print(f"Predicted page name: {predicted_page}")


# Function to analyze the most frequent locations
def analyze_most_frequent_locations(df, top_n=5):
    location_counts = df.groupby(['country', 'region', 'city'])['visit_count'].sum().reset_index()
    location_counts = location_counts.sort_values('visit_count', ascending=False)
    return location_counts.head(top_n)

most_frequent = analyze_most_frequent_locations(abalone_train)
print("\nMost Frequent Locations:")
print(most_frequent)

# Function to predict the most frequent page for a given location
def predict_most_frequent_page_for_location(df, country, region, city):
    location_data = df[(df['country'] == country) & (df['region'] == region) & (df['city'] == city)]
    if location_data.empty:
      return "No data available for this location."
    most_frequent_page = location_data['page_name'].mode().iloc[0]
    return most_frequent_page


# Example usage:
predicted_most_frequent_page = predict_most_frequent_page_for_location(abalone_train, 'US', 'CA', 'San Francisco')
print(f"\nMost Frequent Page for 'US', 'CA', 'San Francisco': {predicted_most_frequent_page}")

Predicted page name: https://enally.in/

Most Frequent Locations:
           country  region       city  \
533        country  region       city   
36          Canada      ON     Guelph   
530  United States      WA    Seattle   
120          India      DL      Delhi   
121          India      DL  New Delhi   

                                           visit_count  
533                                        visit_count  
36                                                   7  
530  6666666611666666661111112111311321211111211121...  
120  5222151151635632114182211111211122118113112231...  
121  5111111111111211111121122112112555411521213213...  

Most Frequent Page for 'US', 'CA', 'San Francisco': No data available for this location.


In [29]:
# prompt: What more and advance we can do with these data, continue doing more stuff

# ... (Your existing code) ...

# Function to analyze the most frequent pages
def analyze_most_frequent_pages(df, top_n=5):
    page_counts = df['page_name'].value_counts().head(top_n)
    return page_counts

most_frequent_pages = analyze_most_frequent_pages(abalone_train)
print("\nMost Frequent Pages:")
print(most_frequent_pages)


# Function to predict pages based on time of day
def predict_page_based_on_time(df, hour_of_day):
    # Convert 'visit_date' to datetime objects, specifying the format
    df['visit_date'] = pd.to_datetime(df['visit_date'], format='%Y-%m-%d %H:%M:%S UTC', errors='coerce')


    # Extract the hour from the 'visit_date'
    df['hour'] = df['visit_date'].dt.hour

    # Filter data based on hour of day
    hourly_data = df[df['hour'] == hour_of_day]
    if hourly_data.empty:
      return "No data available for this hour."
    most_frequent_page_hour = hourly_data['page_name'].mode().iloc[0]
    return most_frequent_page_hour


# Example Usage
predicted_page_hour_10 = predict_page_based_on_time(abalone_train, 10)  # Predict for 10 AM
print(f"\nMost Frequent Page at 10 AM: {predicted_page_hour_10}")

def analyze_trends_over_time(df, time_interval='D'): #Daily trends
    # Convert 'visit_date' to datetime objects, specifying the format
    df['visit_date'] = pd.to_datetime(df['visit_date'], format='%Y-%m-%d %H:%M:%S UTC', errors='coerce')

    # Filter out rows with NaT values in the 'visit_date' column
    df = df.dropna(subset=['visit_date'])

    # Extract the hour from the 'visit_date' (if needed)
    df['hour'] = df['visit_date'].dt.hour

    # Now group by time interval and calculate the sum of 'visit_count'
    trends = df.groupby(pd.Grouper(key='visit_date', freq=time_interval))['visit_count'].sum()
    return trends

trends = analyze_trends_over_time(abalone_train)
print("\nTrends over Time (Daily):")
print(trends)


# Example: Visualize trends over time (example: daily trends)
trends_chart = alt.Chart(trends.reset_index()).mark_line().encode(
    x='visit_date:T',  # Temporal encoding for the x-axis
    y='visit_count:Q'
).properties(
    width=600,
    height=400,
    title="Visit Count Trends over Time"
).interactive()

trends_chart.display()


Most Frequent Pages:
page_name
https://enally.in/                        2699
https://enally.in/services                 893
https://enally.in/projects-list            692
https://enally.in/social-media-content     545
https://enally.in/blogs                    310
Name: count, dtype: int64

Most Frequent Page at 10 AM: No data available for this hour.

Trends over Time (Daily):
Series([], Freq: D, Name: visit_count, dtype: object)


In [31]:
!pip install pydeck

Collecting pydeck
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck
Successfully installed pydeck-0.9.1


In [40]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from branca.colormap import LinearColormap

# Convert 'lat', 'lon', and 'visit_count' to numeric, handling errors
abalone_train['lat'] = pd.to_numeric(abalone_train['lat'], errors='coerce')
abalone_train['lon'] = pd.to_numeric(abalone_train['lon'], errors='coerce')
abalone_train['visit_count'] = pd.to_numeric(abalone_train['visit_count'], errors='coerce')

# Filter out rows with invalid 'lat', 'lon', or 'visit_count' values
abalone_train = abalone_train.dropna(subset=['lat', 'lon', 'visit_count'])

# Aggregate visit counts by location
location_counts = abalone_train.groupby(['lat', 'lon'])['visit_count'].sum().reset_index()

# Create a world map
world_map = folium.Map(location=[0, 0], zoom_start=2)

# Create a color map
color_map = LinearColormap(colors=['green', 'yellow', 'orange', 'red'],
                           vmin=location_counts['visit_count'].min(),
                           vmax=location_counts['visit_count'].max())

# Add markers to the map
marker_cluster = MarkerCluster().add_to(world_map)

for idx, row in location_counts.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=5,
        popup=f"Visits: {row['visit_count']}",
        color=color_map(row['visit_count']),
        fill=True,
        fillColor=color_map(row['visit_count'])
    ).add_to(marker_cluster)

# Add color legend
color_map.add_to(world_map)

# Save the map
world_map.save("world_map_visits.html")

# Display the map (this will work in Jupyter notebooks)
world_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abalone_train['lat'] = pd.to_numeric(abalone_train['lat'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abalone_train['lon'] = pd.to_numeric(abalone_train['lon'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abalone_train['visit_count'] = pd.to_numeric(abalone_