In [None]:
import io
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import requests
from io import BytesIO
import os
import difflib
import re
from urllib.parse import urlparse
import openai
from openai import OpenAI
import validators
import gdown
from urllib.parse import parse_qs
def cleaning(data):
        clean_col_names = []
        for col in data.columns:
            clean_col = re.sub(r'[_\-]', ' ', col)
            clean_col = re.sub(r'(?<!^)(?=[A-Z])', ' ', col)
            clean_col = clean_col.strip()
            clean_col = clean_col.replace('_', ' ')
            clean_col = re.sub(r'\s+', ' ', clean_col)
            clean_col = clean_col.lower()# Improved regex for better name cleaning
            clean_col_names.append(clean_col)
        data.columns = clean_col_names
        return data
def extract_file_id(drive_link):
        """Extracts the file ID from a Google Drive link."""
        file_id = None
        try:
            parsed = urlparse(drive_link)
            file_id = drive_link.split("/")[5]
            filename = os.path.basename(parsed.path)
        except IndexError:
            print("Invalid Google Drive link provided.")
        return file_id

def download_file(file_id):
        """Downloads the file from Google Drive."""
        download_url = f"https://drive.google.com/uc?id={file_id}"
        response = requests.get(download_url)
        return response.content

def load_data():
        print("How would you like to input the data?")
        print("1. Browse from system")
        print("2. Provide a link to online data")
        choice = input("Enter your choice (1 or 2): ")

        if choice == '1':
            root = tk.Tk()
            root.withdraw()  # Hide the main window
            file_path = filedialog.askopenfilename()  # Open file dialog

            if file_path:
                try:
                    # Detect file type and read accordingly
                    file_ext = os.path.splitext(file_path)[-1].lower()
                    if file_ext == '.csv' or file_ext in ['.xls', '.xlsx']:
                        if file_ext == '.csv':
                            with open(file_path, 'r') as file:
                                first_line = file.readline()
                                separator = re.search("[,;\t]", first_line).group()
                            data = pd.read_csv(file_path, sep=separator)
                        else:
                            data = pd.read_excel(file_path)
                    else:
                        print("Unsupported file format. Please provide a CSV, XLS, or XLSX file.")
                        return None, None,None
                    # Check if data has any columns
                    if len(data.columns) == 0:
                        print("Error: The file contains no columns.")
                        return None, None,None

                    # Clean column names
                    data = cleaning(data)

                    # Extract filename and get dataset name confirmation
                    filename = os.path.basename(file_path)
                    dataset_name = filename
                    #confirm_name = input(f"Is '{filename}' the intended dataset name? (yes/no): ")
                    #dataset_name = filename if confirm_name.lower() == 'yes' else input("Enter the dataset name: ")
                    #target_variable, data = targeted_variable(data)  # Capture the returned value from target_variable function
                    target_variable = get_target_variables(data, dataset_name)
                    return data, dataset_name, target_variable
                    

                except Exception as e:
                    print("Error:", e)
                    return None, None,None

            else:
                print("No file selected.")
                return None, None,None

        elif choice == '2':
            url = input("Enter the URL of the online data: ")
            if not validators.url(url):
                print("Invalid URL. Please check the URL and try again.")
                return None, None,None
            try:
                data = None
                if 'drive.google.com' in url:
                    # Google Drive URL
                    try:
                        file_id = extract_file_id(url)
                        if file_id is not None:
                            file_content = download_file(file_id)
                            # Try reading the file
                            try:
                                data = pd.read_csv(io.StringIO(file_content.decode('utf-8')))
                            except pd.errors.ParserError:
                                try:
                                    data = pd.read_excel(io.BytesIO(file_content))
                                except Exception as e:
                                    print("Unsupported file format. Please provide a CSV, XLS, or XLSX file.")
                                    return None, None,None
                        else:
                            print("Invalid Google Drive URL. The URL should contain an 'id' parameter.")
                            return None, None,None
                    except Exception as e:
                        print("Error:", e)
                        print("An error occurred. Please check the URL and try again.")
                        return None, None,None

                else:
                    # Other URL
                    response = requests.get(url)
                    if url.endswith('.csv'):
                        data = pd.read_csv(BytesIO(response.content))
                    elif url.endswith('.xls') or url.endswith('.xlsx'):
                        data = pd.read_excel(BytesIO(response.content))
                    else:
                        print("Unsupported file format. Please provide a CSV, XLS, or XLSX file.")
                        return None, None,None
                # Clean column names
                data = cleaning(data)

                # Check if data has any columns
                if len(data.columns) == 0:
                    print("Error: The online data contains no columns.")
                    return None, None,None
                path = urlparse(url).path
                filename = os.path.basename(path)
                dataset_name = filename
                target_variable = get_target_variables(data, dataset_name)
                return data, dataset_name, target_variable
            except Exception as e:
                print("Error:", e)
                print("An error occurred. Please check the URL and try again.")
                return None, None,None
        else:
            print("No file selected.")
            return None, None,None
        
###

def get_target_variables(data,dataset_name):
                # Load the dataset
                # Get basic information about the dataset
                df = pd.DataFrame(data)
                num_records = len(df)
                num_features = len(df.columns)
                feature_names = [str(name) for name in df.columns.tolist()]  # Convert feature names to strings
                data_types = df.dtypes.tolist()
                data_types_str = [str(dtype) for dtype in data_types]
                missing_values = df.isnull().sum().tolist()
                dataset_shape = df.shape       
                
                # Generate a brief introduction using GPT API
                prompt = f"this is a dataset of {dataset_name} and overall shape of dataset is {dataset_shape}.The dataset contains {num_records} records and {num_features} features. The features include: {', '.join(feature_names)}. The data types of features are: {', '.join(map(str, data_types_str))}."
                prompt += f"Here are the summary statistics:\n{df.describe(include='all')}"
                prompt += f" Please provide most possible target varaible (only one) of dataset return in a only a single string."
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "system", "content": prompt}],
                    max_tokens=200,
                    temperature=0.7,
                    top_p=1.0,
                    frequency_penalty=0.0,
                    presence_penalty=0.0
                )
                # Print the generated introduction
                target_variable = (response.choices[0].message.content)
                # If the response is not a string, convert it to a string
                if not isinstance(target_variable, str):
                    target_variable = str(target_variable)
                if target_variable:
                    print("Target variables found:", target_variable)
                else:
                    print("No target variables found.")
                    target_variable = None
                return target_variable
data, dataset_name, target_variable = load_data()
print(data)
print(dataset_name)
print(target_variable)
df = pd.DataFrame(data)  

In [None]:
class BivariateAnalyzer1:
    def __init__(self, df, dataset_name):
        self.df = df
        self.dataset_name = dataset_name

    def analyze(self):
        analysis_results = {}
        for column1 in self.df.columns:
            for column2 in self.df.columns:
                if column1 != column2:
                    result = self.analyze_columns(column1, column2)
                    if result is not None:
                        analysis_results[(column1, column2)] = result
        return analysis_results

    def analyze_columns(self, column1, column2):
        series1 = self.df[column1]
        series2 = self.df[column2]
        if pd.api.types.is_numeric_dtype(series1) and pd.api.types.is_numeric_dtype(series2):
            correlation = series1.corr(series2)
            return {'correlation': correlation}
        elif pd.api.types.is_numeric_dtype(series1) and pd.api.types.is_categorical_dtype(series2):
            grouped_mean = series1.groupby(series2).mean()
            return {'grouped_mean': grouped_mean}
        elif pd.api.types.is_categorical_dtype(series1) and pd.api.types.is_numeric_dtype(series2):
            grouped_mean = series2.groupby(series1).mean()
            return {'grouped_mean': grouped_mean}
        elif pd.api.types.is_categorical_dtype(series1) and pd.api.types.is_categorical_dtype(series2):
            unique_combinations = self.df.groupby([series1.name, series2.name]).size()
            return {'unique_combinations': unique_combinations}

analyzer = BivariateAnalyzer1(df, 'dataset_name')
analysis_results = analyzer.analyze()

In [None]:
import random
import warnings
warnings.filterwarnings("ignore")

def bi_poss_corr(df, dataset_name, target_variable, analysis_results):
    dataset_columns = df.columns.tolist()
    data_types = df.dtypes.tolist()
    max_unique_values = 10
    unique_counts = df.nunique()
    object_columns = {}
    length = len(df.columns)
    length = 2 * length
    for col in df.columns:
        unique_values = df[col].unique().tolist()
        if len(unique_values) > max_unique_values:
            unique_values = random.sample(unique_values, max_unique_values)
            object_columns[col] = unique_values
        prompt = (
            f"In the {dataset_name} dataset, perform a bivariate analysis with the target variable '{target_variable}'. "
            f"Return a dictionary of at least {length} column pairs, where each pair of columns should be the target variable '{target_variable}'. "
            f"Use the correlations {analysis_results} to select pairs. "
            f"Include pairs with the most positive, most negative, and neutral correlations. "
            f"Format each pair as 'column1':'column2'. "
            f"Use full, case-sensitive, and unique variable names. "
            f"Avoid syntax errors and do not include the correlation value in the output. "
            f"Focus on correlation (most positive, most negative and balanced) the target variable with as many other variables as possible."
            f"each column will pair with at least 3 other columns except target_variable"
            f"Output format of your will be a dictionary with the format {{\column\ : \column\, \column\ : \column\}} only."
            )
        print(prompt)
        response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "system", "content": prompt}],
                max_tokens=2000,
                temperature=0.7,
                top_p=0.7,
                frequency_penalty=0.0,
                presence_penalty=0.0
            )
            
            # Parse the generated response into dictionary format
        response_content = response.choices[0].message.content
        try:
            # Remove the outer double quotes and newline characters
            response_content = response_content.strip('"\n')
            # Replace curly braces with square brackets to convert dictionary to list
            response_content = response_content.replace('{', '[').replace('}', ']')
            # Split the string into lines
            lines = response_content.split(',')
            bi_columns = [{line.strip().split(':')[0].strip().strip('"\n[').strip("'"): line.strip().split(':')[1].strip().strip('"\n]').strip("'")} for line in lines if ':' in line]            for i in range(len(bi_columns)):
                for key, value in bi_columns[i].items():
                    # Remove the quotes from the key and value
                    new_key = key.strip("'")
                    new_value = value.strip("'")
                    # Update the dictionary with the new key and value
                    bi_columns[i] = {new_key: new_value}            
            return bi_columns
        except Exception as e:
            print("Error:", e)
            return None
bi_columns = bi_poss_corr(df,dataset_name,target_variable,analysis_results)

In [None]:
def bi_analyze(df, dataset_name, bi_columns, analysis_results):
    bi_descriptions = {} 
    max_unique_values = 10
    object_columns={}
    for col in df.columns:
                unique_values = df[col].unique().tolist()
                if len(unique_values) > max_unique_values:
                    unique_values = random.sample(unique_values, max_unique_values)
                    object_columns[col] = unique_values
    for bi_column in bi_columns:
        for column1, column2 in bi_column.items():
            uni1 = object_columns.get(column1)
            uni2 = object_columns.get(column2)
            stats = analysis_results.get((column1, column2), {})
            prompt = f"these are unique values in {column1} {uni1} and {column2} {uni2} in the dataset. Please generate a description only about the relationship (for bivariate analysis) between {column1} and {column2}  by using {stats} in simple words or in natural language for my Graph. Start with explaining their relationship and how to they are related only and don't describe about dataset starts with the relationship between these columns are tend to be like that. Don't return all the unique values of columns that are given thats are only for reference. "
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "system", "content": prompt}],
                max_tokens=300,
                temperature=0.2,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0
            )
            
            bi_descriptions[(column1, column2)] = response.choices[0].message.content
            

    return bi_descriptions

In [None]:
bi_gpt = bi_analyze(df, dataset_name, bi_columns, analysis_results)

In [None]:
bi_gpt

In [None]:
class BivariateAnalyzer:
    def __init__(self, df, bi_gpt, bi_columns):
        self.df = df
        self.bi_gpt = bi_gpt
        self.bi_columns = bi_columns

    def visualize(self):
        with PdfPages('Bi_variate_output.pdf') as pdf:
            for column_pair_dict in self.bi_columns:
                for column_pair in column_pair_dict.items():
                    fig, axs = plt.subplots(2, 1, figsize=(6, 4))
                    column1, column2 = column_pair
                    print(f"Processing columns: {column1}, {column2}")  # Debug print
                    if pd.api.types.is_numeric_dtype(self.df[column1]) and pd.api.types.is_numeric_dtype(self.df[column2]):
                        print(f"Creating scatterplot for {column1} and {column2}")  # Debug print
                        sns.scatterplot(data=self.df, x=column1, y=column2, ax=axs[0])
                        axs[0].set_title(f"Relationship between {column1} and {column2}")
                    elif pd.api.types.is_numeric_dtype(self.df[column1]) and pd.api.types.is_categorical_dtype(self.df[column2]):
                        print(f"Creating boxplot for {column1} (numeric) and {column2} (categorical)")  # Debug print
                        sns.boxplot(x=column2, y=column1, data=self.df, ax=axs[0])
                        axs[0].set_title(f"Relationship between {column1} (numeric) and {column2} (categorical)")
                    elif pd.api.types.is_categorical_dtype(self.df[column1]) and pd.api.types.is_numeric_dtype(self.df[column2]):
                        print(f"Creating boxplot for {column1} (categorical) and {column2} (numeric)")  # Debug print
                        sns.boxplot(x=column1, y=column2, data=self.df, ax=axs[0])
                        axs[0].set_title(f"Relationship between {column1} (categorical) and {column2} (numeric)")
                    else:
                        print(f"Creating countplot for {column1} and {column2} (both categorical)")  # Debug print
                        sns.countplot(x=column1, hue=column2, data=self.df, ax=axs[0])
                        axs[0].set_title(f"Relationship between {column1} and {column2} (both categorical)")

                    axs[1].text(0.5, 0.5, self.bi_gpt[(column1, column2)], wrap=True, horizontalalignment='center', verticalalignment='center', fontsize=8)
                    axs[1].axis('off')  # Hide the axes

                    pdf.savefig(fig)  # saves the current figure into a pdf page
                    plt.close()

In [None]:
import tkinter as tk
from tkinter import messagebox
from tkcalendar import DateEntry
from datetime import datetime, time

def calculate():
    try:
        principal = float(principal_entry.get())
        rate = float(rate_entry.get())  # This is a monthly rate
        date = datetime.combine(date_entry.get_date(), time.min)
        end_date = datetime.now()  # get current date and time
        total_days = (end_date - date).days

        years = total_days // 365
        remaining_days = total_days % 365
        months = remaining_days // 30
        days = remaining_days % 30

        amount = principal
        for i in range(years):
            yearly_interest = 0
            for j in range(12):  # calculate interest for each month in a year
                yearly_interest += amount * rate / 100
            amount += yearly_interest  # add the yearly interest to the principal
            print(f"The amount after {i+1} years is {amount}")

        if months > 0:
            monthly_interest = amount * rate / 100 * months  # calculate interest for remaining months
            amount += monthly_interest

        if days > 0:
            daily_interest = amount * rate / 100 / 30 * days  # calculate interest for remaining days
            amount += daily_interest

        print(f"The amount after {years} years, {months} months and {days} days is {amount}")

    except Exception as e:
        messagebox.showerror("Error", str(e))

root = tk.Tk()

principal_label = tk.Label(root, text="Principal:")
principal_label.pack()
principal_entry = tk.Entry(root)
principal_entry.pack()

rate_label = tk.Label(root, text="Monthly Interest Rate (%):")
rate_label.pack()
rate_entry = tk.Entry(root)
rate_entry.pack()

date_label = tk.Label(root, text="Date:")
date_label.pack()
date_entry = DateEntry(root)
date_entry.pack()

calculate_button = tk.Button(root, text="Calculate", command=calculate)
calculate_button.pack()

root.mainloop()

In [7]:
from flask import Flask, render_template, request
from datetime import datetime, time

app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        try:
            principal = float(request.form.get('principal'))
            rate = float(request.form.get('rate'))  # This is a monthly rate
            date = datetime.strptime(request.form.get('date'), '%Y-%m-%d')
            end_date = datetime.now()  # get current date and time
            total_days = (end_date - date).days

            years = total_days // 365
            remaining_days = total_days % 365
            months = remaining_days // 30
            days = remaining_days % 30

            amount = principal
            for i in range(years):
                yearly_interest = 0
                for j in range(12):  # calculate interest for each month in a year
                    yearly_interest += amount * rate / 100
                amount += yearly_interest  # add the yearly interest to the principal

            if months > 0:
                monthly_interest = amount * rate / 100 * months  # calculate interest for remaining months
                amount += monthly_interest

            if days > 0:
                daily_interest = amount * rate / 100 / 30 * days  # calculate interest for remaining days
                amount += daily_interest

            result = f"The amount after {years} years, {months} months and {days} days is {amount}"
            return render_template('index.html', result=result)

        except Exception as e:
            return render_template('index.html', error=str(e))

    return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

In [None]:
from IPython.display import display, HTML
from threading import Timer
from werkzeug.serving import run_simple

def run_in_background(app):
    Timer(1, lambda: run_simple('localhost', 5000, app)).start()
    return app

run_in_background(app)

<Flask '__main__'>

 * Running on http://localhost:5000
Press CTRL+C to quit
127.0.0.1 - - [15/Apr/2024 06:15:15] "GET / HTTP/1.1" 500 -
Error on request:
Traceback (most recent call last):
  File "c:\Users\Anuj Kesharwani\AppData\Local\Programs\Python\Python310\lib\site-packages\werkzeug\serving.py", line 333, in run_wsgi
    execute(self.server.app)
  File "c:\Users\Anuj Kesharwani\AppData\Local\Programs\Python\Python310\lib\site-packages\werkzeug\serving.py", line 320, in execute
    application_iter = app(environ, start_response)
  File "c:\Users\Anuj Kesharwani\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2551, in __call__
    return self.wsgi_app(environ, start_response)
  File "c:\Users\Anuj Kesharwani\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2531, in wsgi_app
    response = self.handle_exception(e)
  File "c:\Users\Anuj Kesharwani\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2528, in wsgi_app
   