In [None]:
import pandas as pd
import datetime

In [None]:
df = pd.read_csv("/content/SHARKTANK.csv")

In [None]:
df.head()

Unnamed: 0,Startup Name,Industry,Business Description,Company Website,Number of Presenters,Pitchers Average Age,Started in,Pitchers State,Yearly Revenue,Monthly Sales,...,Original Ask Amount,Original Offered Equity,Valuation Requested,Received Offer,Accepted Offer,Total Deal Amount,Total Deal Equity,Total Deal Debt,Deal Valuation,Has Patents
0,BluePineFoods,Food,Frozen Momos,https://bluepinefoods.com/,3,Middle,2016.0,Delhi,95.0,8.0,...,50.0,5.0,1000.0,1,1.0,75.0,16.0,,469.0,
1,BoozScooters,Vehicles/Electrical Vehicles,Renting e-bike for mobility in private spaces,https://www.boozup.net/,1,Young,2017.0,Gujarat,4.0,0.4,...,40.0,15.0,267.0,1,1.0,40.0,50.0,,80.0,
2,HeartUpMySleeves,Beauty/Fashion,Detachable Sleeves,https://heartupmysleeves.com/,1,Young,2021.0,Delhi,,2.0,...,25.0,10.0,250.0,1,1.0,25.0,30.0,,83.0,
3,TagzFoods,Food,Healthy Potato Chips Snacks,https://tagzfoods.com/,2,Middle,2019.0,Karnataka,700.0,,...,70.0,1.0,7000.0,1,1.0,70.0,2.75,,2545.0,
4,HeadAndHeart,Education,Brain Development Course,https://thehnh.in/,4,Middle,2015.0,Punjab,30.0,,...,50.0,5.0,1000.0,0,,,,,,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Startup Name             316 non-null    object 
 1   Industry                 316 non-null    object 
 2   Business Description     316 non-null    object 
 3   Company Website          304 non-null    object 
 4   Number of Presenters     316 non-null    int64  
 5   Pitchers Average Age     316 non-null    object 
 6   Started in               179 non-null    float64
 7   Pitchers State           261 non-null    object 
 8   Yearly Revenue           131 non-null    float64
 9   Monthly Sales            138 non-null    float64
 10  Gross Margin             84 non-null     float64
 11  Net Margin               22 non-null     float64
 12  Original Ask Amount      316 non-null    float64
 13  Original Offered Equity  316 non-null    float64
 14  Valuation Requested      3

In [None]:
# Data Cleaning

# Making 'Company website' & 'Started in' columns relevent to the model
df['website'] = df['Company Website'].notna().astype(int)
current_year = datetime.datetime.now().year
df['years_active'] = current_year - df['Started in']

# Cleaning the null values
# filling 0 in place of NaN
null_columns = ['Yearly Revenue','Monthly Sales','Net Margin','Total Deal Amount','Total Deal Equity','Total Deal Debt','Deal Valuation','Has Patents','Accepted Offer']
df[null_columns] = df[null_columns].fillna(0)

# filling mean margin in Nan for Gross Margin
mean_margin = df['Gross Margin'].mean()
df['Gross Margin'] = df['Gross Margin'].fillna(mean_margin)

# filling median value in Nan for years_active
median_years = df['years_active'].median()
df['years_active'] = df['years_active'].fillna(median_years)

# Changing the datatype
df['Has Patents'] = df['Has Patents'].astype(int)
df.dropna(inplace=True)
# Dropping the irrelevent columns
df = df.drop(['Net Margin','Pitchers Average Age','Company Website','Started in','Pitchers State','Startup Name', 'Business Description','Total Deal Amount','Total Deal Equity','Total Deal Debt','Deal Valuation'],axis=1)

In [None]:
df.head()

Unnamed: 0,Industry,Number of Presenters,Yearly Revenue,Monthly Sales,Gross Margin,Original Ask Amount,Original Offered Equity,Valuation Requested,Received Offer,Accepted Offer,Has Patents,website,years_active
0,Food,3,95.0,8.0,53.535714,50.0,5.0,1000.0,1,1.0,0,1,7.0
1,Vehicles/Electrical Vehicles,1,4.0,0.4,53.535714,40.0,15.0,267.0,1,1.0,0,1,6.0
2,Beauty/Fashion,1,0.0,2.0,53.535714,25.0,10.0,250.0,1,1.0,0,1,2.0
3,Food,2,700.0,0.0,48.0,70.0,1.0,7000.0,1,1.0,0,1,4.0
4,Education,4,30.0,0.0,53.535714,50.0,5.0,1000.0,0,0.0,0,1,8.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155 entries, 0 to 314
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Industry                 155 non-null    object 
 1   Number of Presenters     155 non-null    int64  
 2   Yearly Revenue           155 non-null    float64
 3   Monthly Sales            155 non-null    float64
 4   Gross Margin             155 non-null    float64
 5   Original Ask Amount      155 non-null    float64
 6   Original Offered Equity  155 non-null    float64
 7   Valuation Requested      155 non-null    float64
 8   Received Offer           155 non-null    int64  
 9   Accepted Offer           155 non-null    float64
 10  Has Patents              155 non-null    int64  
 11  website                  155 non-null    int64  
 12  years_active             155 non-null    float64
dtypes: float64(8), int64(4), object(1)
memory usage: 17.0+ KB


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def train_model(df):
    # One-hot encode 'Industry'
    # df = pd.get_dummies(df, columns=['Industry'])
    
    
    # Drop target variables and split data into features (X) and target variable (y)
    X = df.drop(['Received Offer', 'Accepted Offer','Industry'], axis=1)
    y = df['Received Offer']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Fit the logistic regression model
    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr.fit(X_train, y_train)
    
    # Predict on the test set and calculate accuracy
    y_pred = lr.predict(X_test)
    y_pred = lr.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1-score:', f1)
    
    return lr

In [None]:
def predict_offer(lr):
    # Take input from user

    #industry = input('Enter Industry: ')
    presenters = int(input('Enter Number of Presenters: '))
    revenue = float(input('Enter Yearly Revenue: '))
    # sales = float(input('Enter Monthly Sales: '))
    margin = float(input('Enter Net Margin: '))
    ask_amount = float(input('Enter Original Ask Amount: '))
    offered_equity = float(input('Enter Original Offered Equity: '))
    # valuation = float(input('Enter Valuation Requested: '))
    has_patents = int(input('Enter Has Patents (0: No, 1: Yes): '))
    website = int(input('Enter Website (0: No, 1: Yes): '))
    years_active = float(input('Enter Years Active: '))
    
    # Create a DataFrame from user input
    data = pd.DataFrame({
        # 'Industry_Agriculture': [0],
        # 'Industry_Animal/Pets': [0],
        # 'Industry_Beauty/Fashion': [0],
        # 'Industry_Education': [0],
        # 'Industry_Electronics': [0],
        # 'Industry_Entertainment': [0],
        # 'Industry_Food': [0],
        # 'Industry_Furnishing/Household': [0],
        # 'Industry_Beauty/Fashion': [0],
        # 'Industry_Hardware': [0],
        # 'Industry_Liquor/Beverages': [0],
        # 'Industry_Manufacturing': [0],
        # 'Industry_Medical/Health': [0],
        # 'Industry_Services': [0],
        # 'Industry_Sports': [0],
        # 'Industry_Technology/Software': [0],
        # 'Industry_Vehicles/Electrical Vehicles': [0],
        'Number of Presenters': [presenters],
        'Yearly Revenue': [revenue],
        'Monthly Sales': [(revenue/12)],
        'Gross Margin' : [margin],
        'Original Ask Amount': [ask_amount],
        'Original Offered Equity': [offered_equity],
        'Valuation Requested': [(ask_amount*offered_equity)],
        'Has Patents': [has_patents],
        'website': [website],
        'years_active': [years_active]
    })
    # data['Industry_' + industry] = 1
    
    # Make prediction using the trained logistic regression model
    prediction = lr.predict(data)
    
    if prediction == 1:
        print('This startup is likely to receive an offer.')
    else:
        print('This startup is unlikely to receive an offer.')


In [None]:
# Running the code
lr = train_model(df)
predict_offer(lr)

Accuracy: 0.7419354838709677
Precision: 0.7241379310344828
Recall: 1.0
F1-score: 0.8400000000000001
Enter Number of Presenters: 2
Enter Yearly Revenue: 4500000
Enter Net Margin: 40
Enter Original Ask Amount: 1000000
Enter Original Offered Equity: 10
Enter Has Patents (0: No, 1: Yes): 1
Enter Website (0: No, 1: Yes): 1
Enter Years Active: 2
This startup is likely to receive an offer.


**Inferences**
* The model has an accuracy of 74.2%, which means that it correctly predicts the outcome for 74.2% of the startups in the test set. While this accuracy is decent, it may not be sufficient for certain use cases where higher accuracy is desired.

* The model has a precision of 72.4%, which means that when it predicts that a startup will receive an offer, it is correct 72.4% of the time. This metric is important when the cost of false positives is high, i.e., when it is worse to predict that a startup will receive an offer when it actually won't.

* The model has a recall of 100%, which means that it correctly identifies all the startups that actually receive an offer. This metric is important when the cost of false negatives is high, i.e., when it is worse to predict that a startup won't receive an offer when it actually will.

* The F1-score of the model is 84.0%, which is a weighted average of precision and recall. This metric balances the importance of precision and recall, and is a good measure of overall performance.