In [1]:
import pandas as pd
import plotly.plotly as py
from geopy.distance import vincenty
import math
py.sign_in('Frank1993', 'kgkpACBeHU2lf1yFoWEW')

In [2]:
 class GeoLocation:
    '''
    Class representing a coordinate on a sphere, most likely Earth.
    
    This class is based from the code smaple in this paper:
        http://janmatuschek.de/LatitudeLongitudeBoundingCoordinates
        
    The owner of that website, Jan Philip Matuschek, is the full owner of 
    his intellectual property. This class is simply a Python port of his very
    useful Java code. All code written by Jan Philip Matuschek and ported by me 
    (which is all of this class) is owned by Jan Philip Matuschek.
    '''
 
 
    MIN_LAT = math.radians(-90)
    MAX_LAT = math.radians(90)
    MIN_LON = math.radians(-180)
    MAX_LON = math.radians(180)
    
    EARTH_RADIUS = 6378.1  # kilometers
    
    
    @classmethod
    def from_degrees(cls, deg_lat, deg_lon):
        rad_lat = math.radians(deg_lat)
        rad_lon = math.radians(deg_lon)
        return GeoLocation(rad_lat, rad_lon, deg_lat, deg_lon)
        
    @classmethod
    def from_radians(cls, rad_lat, rad_lon):
        deg_lat = math.degrees(rad_lat)
        deg_lon = math.degrees(rad_lon)
        return GeoLocation(rad_lat, rad_lon, deg_lat, deg_lon)
    
    
    def __init__(
            self,
            rad_lat,
            rad_lon,
            deg_lat,
            deg_lon
    ):
        self.rad_lat = float(rad_lat)
        self.rad_lon = float(rad_lon)
        self.deg_lat = float(deg_lat)
        self.deg_lon = float(deg_lon)
        self._check_bounds()
        
    def __str__(self):
        degree_sign= u'\N{DEGREE SIGN}'
        return ("({0:.4f}deg, {1:.4f}deg) = ({2:.6f}rad, {3:.6f}rad)").format(
            self.deg_lat, self.deg_lon, self.rad_lat, self.rad_lon)
        
    def _check_bounds(self):
        if (self.rad_lat < GeoLocation.MIN_LAT 
                or self.rad_lat > GeoLocation.MAX_LAT 
                or self.rad_lon < GeoLocation.MIN_LON 
                or self.rad_lon > GeoLocation.MAX_LON):
            raise Exception("Illegal arguments")
            
    def distance_to(self, other, radius=EARTH_RADIUS):
        '''
        Computes the great circle distance between this GeoLocation instance
        and the other.
        '''
        return radius * math.acos(
                math.sin(self.rad_lat) * math.sin(other.rad_lat) +
                math.cos(self.rad_lat) * 
                math.cos(other.rad_lat) * 
                math.cos(self.rad_lon - other.rad_lon)
            )
            
    def bounding_locations(self, distance, radius=EARTH_RADIUS):
        '''
        Computes the bounding coordinates of all points on the surface
        of a sphere that has a great circle distance to the point represented
        by this GeoLocation instance that is less or equal to the distance argument.
        
        Param:
            distance - the distance from the point represented by this GeoLocation
                       instance. Must be measured in the same unit as the radius
                       argument (which is kilometers by default)
            
            radius   - the radius of the sphere. defaults to Earth's radius.
            
        Returns a list of two GeoLoations - the SW corner and the NE corner - that
        represents the bounding box.
        '''
        
        if radius < 0 or distance < 0:
            raise Exception("Illegal arguments")
            
        # angular distance in radians on a great circle
        rad_dist = distance / radius
        
        min_lat = self.rad_lat - rad_dist
        max_lat = self.rad_lat + rad_dist
        
        if min_lat > GeoLocation.MIN_LAT and max_lat < GeoLocation.MAX_LAT:
            delta_lon = math.asin(math.sin(rad_dist) / math.cos(self.rad_lat))
            
            min_lon = self.rad_lon - delta_lon
            if min_lon < GeoLocation.MIN_LON:
                min_lon += 2 * math.pi
                
            max_lon = self.rad_lon + delta_lon
            if max_lon > GeoLocation.MAX_LON:
                max_lon -= 2 * math.pi
        # a pole is within the distance
        else:
            min_lat = max(min_lat, GeoLocation.MIN_LAT)
            max_lat = min(max_lat, GeoLocation.MAX_LAT)
            min_lon = GeoLocation.MIN_LON
            max_lon = GeoLocation.MAX_LON
        
        return [ GeoLocation.from_radians(min_lat, min_lon) , 
            GeoLocation.from_radians(max_lat, max_lon) ]

In [3]:
def getPolygon(lat,lon,radius):
    loc = GeoLocation.from_degrees(lat,lon)
    distance = radius
    SW_loc, NE_loc = loc.bounding_locations(distance)
    min_lat,min_lon = SW_loc.deg_lat,SW_loc.deg_lon
    max_lat,max_lon = NE_loc.deg_lat,NE_loc.deg_lon

    return [max_lat,max_lat,min_lat,min_lat,max_lat],[min_lon,max_lon,max_lon,min_lon,min_lon]

In [4]:
df = pd.read_csv('/Users/hu/Desktop/WBData/WBDataGeoOfNeededAds.txt',sep='\t',names = ["AdId",
           "AdTitle",
           "AdLatitude",
           "AdLongitude",
           "AdTargetRadius",
           "AdGeoTargetedLocationID",
           "LocationCity",
           "LocationMetroArea", "Latitude", "Longitude", "Impresions","Clicks","GoodClicks"])

In [5]:
def milesToKm(length):
    return 1.60934*length

In [8]:
def getBoundaryFunc(maxLat,minLat,maxLon,minLon):
    def isInBoundary(row):
        clickLat = row["Latitude"]
        clickLon = row["Longitude"]
        if clickLat < maxLat and clickLat > minLat and clickLon < maxLon and clickLon > minLon:
            return 1
        else:
            return 0
    return isInBoundary

In [6]:
selected = (df["AdLatitude"] != 0) & (df["AdLongitude"] != 0) & (df["AdTargetRadius"]!=0)

df = df[selected]

print df.shape

(163178, 13)


In [None]:
selectedAdIds = ["13788324073" , "11414358196" , "14362219699", "9903837745", "10561381600" , "4061007624" , "7118171456" , "9143254484" , "12229271140" ,
    "12229849372","12229849367","9903837743","9903837741","6786566351","7783206080","10554650655","10733724030","6684573884"]

In [15]:
df[df["AdTitle"]=="jcpenney portrait studios"].head()


Unnamed: 0,AdId,AdTitle,AdLatitude,AdLongitude,AdTargetRadius,AdGeoTargetedLocationID,LocationCity,LocationMetroArea,Latitude,Longitude,Impresions,Clicks,GoodClicks,Color,Opacity,DistanceBetweenClicksAndTarget
115,13788324073,jcpenney portrait studios,47.616,-122.202,30,0,67560.0,71287.0,47.60894,-122.3366,1,0,0,red,0.2,6.307
520,13788324073,jcpenney portrait studios,39.455,-74.642,40,0,,,39.4516,-74.63087,1,0,0,red,0.2,0.64
971,13788324073,jcpenney portrait studios,47.218,-122.468,30,0,67634.0,71287.0,47.21133,-122.5178,1,1,1,blue,0.6,2.389
1222,13788324073,jcpenney portrait studios,39.829,-75.099,40,0,,,39.56617,-75.01582,1,0,0,red,0.2,18.667
1523,13788324073,jcpenney portrait studios,41.097,-73.952,30,0,59041.0,71249.0,41.18848,-73.664,1,1,1,blue,0.6,16.296


In [9]:
"""
def getColor(row):
        impresions = row["Impresions"]
        clicks = row["Clicks"]
        goodClicks = row["GoodClicks"]

        if goodClicks == 1:
            color = "blue"
        elif clicks == 1:
            color = "green"
        else:
            color = "red"

        return color

df["Color"] = df.apply(getColor,axis = 1)

def getOpacity(row):
    impresions = row["Impresions"]
    clicks = row["Clicks"]
    goodClicks = row["GoodClicks"]

    if goodClicks == 1:
        opacity = 0.6
    elif clicks == 1:
        opacity = 0.6
    else:
        opacity = 0.2

    return opacity

df["Opacity"] = df.apply(getOpacity,axis = 1)
"""
def calDistance(row):
    AdLatitude = row["AdLatitude"]
    AdLongitude = row["AdLongitude"]
    Latitude = row["Latitude"]
    Longitude = row["Longitude"]
    return "%.3f"% vincenty((AdLatitude,AdLongitude),(Latitude,Longitude)).miles

df["DistanceBetweenClicksAndTarget"] = df.apply(calDistance,axis=1)


In [11]:
goodAds = [6684573884,10733724030,7783206080,6786566351,12229849367,12229849372,12229271140,4061007624]

In [13]:
for AdId in goodAds:
    
    data = []
    
    AdDataT = df[df["AdId"]==AdId]
    groups=AdDataT.groupby(["AdTitle","AdLatitude","AdLongitude","AdTargetRadius"])
    print AdId
    
    primaryKey = sorted(groups.groups.keys(), key = lambda x: len(groups.groups[x]))[-1]
    groupData = groups.get_group(primaryKey)
    
    if len(groups.groups[primaryKey]) >500:
        groupData = groupData.sample(500,axis=0)
    
    print groupData.shape
    
    
    
    AdTitle,AdLatitude,AdLongitude,AdTargetRadius = primaryKey
    
    boundLat,boundLon = getPolygon(AdLatitude,AdLongitude,milesToKm(AdTargetRadius))
    
    maxLat,minLat = boundLat[0],boundLat[2]
    maxLon,minLon = boundLon[1],boundLon[0]
    
    boundaryFunc = getBoundaryFunc(maxLat,minLat,maxLon,minLon)
    groupData = groupData.copy()
    groupData["InBoundary"] = groupData.apply(boundaryFunc,axis=1)
    
    
    inBoundaryGroupData = groupData[groupData["InBoundary"] == 1]
    outBoundaryGroupData = groupData[groupData["InBoundary"] == 0]
    
    InBoundaryImpresions = inBoundaryGroupData["Impresions"].sum()
    OutBoundaryImpressions = outBoundaryGroupData["Impresions"].sum()
    
    InBoundaryClicks = inBoundaryGroupData["Clicks"].sum()
    InBoundaryGoodClicks = inBoundaryGroupData["GoodClicks"].sum()
    
    outBoundaryClicks = outBoundaryGroupData["Clicks"].sum()
    outBoundaryGoodClicks = outBoundaryGroupData["GoodClicks"].sum()
    
    ImBoundaryImpresionsRate = float(InBoundaryImpresions)/(InBoundaryImpresions+OutBoundaryImpressions)
    
    CTRInBoundary = float(InBoundaryClicks)/InBoundaryImpresions
    GoodCTRInBoundary = float(InBoundaryGoodClicks)/InBoundaryImpresions
    
    CTROutBoundary = float(outBoundaryClicks)/max(1,OutBoundaryImpressions)
    GoodCTROutBoundary = float(outBoundaryGoodClicks)/max(1,OutBoundaryImpressions)
    
    des = """IB impressions rate: %.3f <br> CTR IB:%.3f <br>GoCTR IB:%.3f <br> CTR OB: %.3f <br> GoCTR OB: %.3f"""%(ImBoundaryImpresionsRate,
                                                                                   CTRInBoundary,
                                                                                   GoodCTRInBoundary,
                                                                                   CTROutBoundary,
                                                                                   GoodCTROutBoundary)
    """
    target = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lat = [AdLatitude],
        lon = [AdLongitude],
        
        marker = dict(
            size = 12,
            opacity = 1,
            reversescale = False,
            #autosize = True,
            color = "yellow",
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area',
            symbol = "circle-x",
        ),
        name = 'Target' )
    
    countour = dict(
        type = 'scattergeo',
        lat = boundLat,
        lon = boundLon,
        mode = 'lines',
        text = [des]*5,
        fill = "toself",
        line = dict(
            width = 2,
            color = 'yellow'
        ),
        fillcolor = "rgb(217, 217, 217)",
        name = "TargetBoundary"
    ) 
    clicks = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lat = groupData["Latitude"].tolist(),
        lon = groupData["Longitude"].tolist(),
        text = groupData["DistanceBetweenClicksAndTarget"].tolist(),
        marker = dict(
            size = 5,
            color = groupData["Color"],
            opacity = groupData["Opacity"],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = 'Clicks'
    )
    
    data.append(countour)
    
    data.append(target)
    data.append(clicks)
    
    
    

    layout = dict(
        title = 'Geo Distribution of Ads:<br> %s'%AdTitle,
        showlegend = True,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(24, 11, 48)",
            countrycolor="rgb(24, 11, 48)"
        ),
    )
    
    print AdTitle
    
    print des
    fig = dict( data=data, layout=layout )
    py.iplot( fig, validate=False, filename='Geo Distribution of Ads: %s'%AdTitle )
    """
    print AdTitle
    
    print des


6684573884
(500, 16)
denver mattress®
IB impressions rate: 0.916 <br> CTR IB:0.098 <br>GoCTR IB:0.076 <br> CTR OB: 0.167 <br> GoCTR OB: 0.143
10733724030
(500, 16)
luxury pre owned duluth
IB impressions rate: 1.000 <br> CTR IB:0.006 <br>GoCTR IB:0.002 <br> CTR OB: 0.000 <br> GoCTR OB: 0.000
7783206080
(500, 16)
certified used hyundai
IB impressions rate: 0.980 <br> CTR IB:0.037 <br>GoCTR IB:0.024 <br> CTR OB: 0.000 <br> GoCTR OB: 0.000
6786566351
(500, 16)
sands casino bethlehem
IB impressions rate: 0.314 <br> CTR IB:0.382 <br>GoCTR IB:0.344 <br> CTR OB: 0.350 <br> GoCTR OB: 0.324
12229849367
(489, 16)
carmax® official site
IB impressions rate: 1.000 <br> CTR IB:0.528 <br>GoCTR IB:0.501 <br> CTR OB: 0.000 <br> GoCTR OB: 0.000
12229849372
(362, 16)
carmax® stores near you
IB impressions rate: 1.000 <br> CTR IB:0.486 <br>GoCTR IB:0.434 <br> CTR OB: 0.000 <br> GoCTR OB: 0.000
12229271140
(474, 16)
carmax® stores near you
IB impressions rate: 1.000 <br> CTR IB:0.435 <br>GoCTR IB:0.409 <br>

In [None]:
AdData = df[df["AdId"]==6684573884]

In [None]:

data = []

for primaryKey,groupData in groups:
    AdTitle,AdLatitude,AdLongitude,AdTargetRadius = primaryKey
    
    boundLat,boundLon = getPolygon(AdLatitude,AdLongitude,milesToKm(AdTargetRadius))
    
    maxLat,minLat = boundLat[0],boundLat[2]
    maxLon,minLon = boundLon[1],boundLon[0]
    
    boundaryFunc = getBoundaryFunc(maxLat,minLat,maxLon,minLon)
    groupData = groupData.copy()
    groupData["InBoundary"] = groupData.apply(boundaryFunc,axis=1)
    
    
    inBoundaryGroupData = groupData[groupData["InBoundary"] == 1]
    outBoundaryGroupData = groupData[groupData["InBoundary"] == 0]
    
    InBoundaryImpresions = inBoundaryGroupData["Impresions"].sum()
    OutBoundaryImpressions = outBoundaryGroupData["Impresions"].sum()
    
    InBoundaryClicks = inBoundaryGroupData["Clicks"].sum()
    InBoundaryGoodClicks = inBoundaryGroupData["GoodClicks"].sum()
    
    outBoundaryClicks = outBoundaryGroupData["Clicks"].sum()
    outBoundaryGoodClicks = outBoundaryGroupData["GoodClicks"].sum()
    
    ImBoundaryImpresionsRate = float(InBoundaryImpresions)/(InBoundaryImpresions+OutBoundaryImpressions)
    
    CTRInBoundary = float(InBoundaryClicks)/InBoundaryImpresions
    GoodCTRInBoundary = float(InBoundaryGoodClicks)/InBoundaryImpresions
    
    CTROutBoundary = float(outBoundaryClicks)/OutBoundaryImpressions
    GoodCTROutBoundary = float(outBoundaryGoodClicks)/OutBoundaryImpressions
    
    des = """IB impressions rate: %s <br> CTR IB:%s <br>GoCTR IB:%s <br> CTR OB: %s <br> GoCTR OB: %s"""%(ImBoundaryImpresionsRate,
                                                                                   CTRInBoundary,
                                                                                   GoodCTRInBoundary,
                                                                                   CTROutBoundary,
                                                                                   GoodCTROutBoundary)
    
    target = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lat = [AdLatitude],
        lon = [AdLongitude],
        
        marker = dict(
            size = 8,
            opacity = 1,
            #autosize = True,
            color = "red",
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area',
            text = "Target",
            symbol = "circle-x",
        ),
        name = 'Target' )
    
    countour = dict(
        type = 'scattergeo',
        lat = boundLat,
        lon = boundLon,
        mode = 'lines',
        text = [des]*5,
        fill = "toself",
        line = dict(
            width = 2,
            color = 'yellow'
        ),
        fillcolor = "rgb(217, 217, 217)",
        name = "TargetBoundary"
    ) 
    clicks = dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lat = groupData["Latitude"].tolist(),
        lon = groupData["Longitude"].tolist(),
        text = groupData["DistanceBetweenClicksAndTarget"].tolist(),
        marker = dict(
            size = 5,
            color = groupData["Color"],
            opacity = groupData["Opacity"],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        name = 'Clicks'
    )
    
    data.append(countour)
    
    data.append(target)
    data.append(clicks)
    
    
    

layout = dict(
        title = 'Geo Distribution of Ads:<br> %s'%AdTitle,
        showlegend = True,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(24, 11, 48)",
            countrycolor="rgb(24, 11, 48)"
        ),
    )

fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False, filename='Geo Distribution of Ads: %s'%AdTitle )
